Global well-being in 2025: A multidimensional analysis of mental, financial, and social health in 92 countries

APPENDIX

Published

15 December 2025

This appendix includes all of the execution steps used in analyzing the data, from preprocessing with the raw public data (A0) to statistical analysis (A1 to A23).

In order to reproduce these steps, it is necessary to place all of the following files in the same directory as 000_analysis_script.qmd. Moreover, a revn.lock file and a Docker image were provided to ensure that the analysis can be executed within the same software environment used by the authors.

To conduct the analyses (A1 to A23) without running the preprocessing steps, place 999_clean_data.rds in the same directory, run the Setup section, and start at any Analyses section. Each analysis section is independent and can be run separately.

Setup

Set working directory

# Set the working directory to the current folder
if (requireNamespace("rstudioapi") && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  setwd(dirname(knitr::current_input()))
}

Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(char = c(
  "MetBrewer",
  "ggridges",
  "metafor",
  "ggtext",
  "binom",
  "mgcv", # Generalized Additive Models
  "interactions", # Interaction plots
  "lsr", # effect size calculations
  "survey", # weighted analysis
    "corrplot", # Correlation plots
    "ggh4x", # Advanced ggplot2 facets
    "htmltools", # Create HTML content
    "sf", # Handle spatial data
    "rnaturalearth", # Obtain map data
    "rnaturalearthdata", # Obtain map data
    "gridExtra", # Arrange multiple plots
    "grid", # Arrange multiple plots
    "gtable", # Arrange multiple plots
    "ggplotify", # Convert plots to grobs
    "qualtRics", # Read files obtained through Qualtrics
    "readr", # Write csv files
    "readxl", # Read excel files
    "flextable", # Create Word documents
    "officer", # Create Word documents
    "dplyr", # Manipulate data during preprocessing
    "tidyr", # Manipulate data during preprocessing
    "stringr", # Manipulate strings during preprocessing
    "janitor", # Clean and manage data during preprocessing
    "ggplot2", # Create plots
    "ggfx", # Add drop shadow effect on elements in a plot
    "psych", # Conduct reliability tests
    "car", # Conduct Anova tests on models
    "emmeans", # Perform contrast analysis
    "lme4", # Run Linear Mixed Models
    "kableExtra", # Display tables in HTML format
    "sjPlot", # Generate advanced tables for models
    "report", # Generate advanced reporting for linear models
    "performance", # Generate advanced reporting for linear models
    "Hmisc", # Generate advanced reporting and weighted statistics
    "semTools", # For McDonald's omega
    "lavaan", # Needed for semTools
    "rmcorr", # Multilevel correlations
    "ggeffects", # Generate plots from marginal effects
    "tibble", # Data wrangling
    "purrr", # Data wrangling
    "forcats", # Data wrangling
    "see", # Data visualization
    "broom.mixed", # Data wrangling
    "showtext", # Custom fonts in plots
    "ggflags", # Country flags in plots
    "scales", # Scale functions for ggplot2
    "countrycode", # Convert country names to different coding schemes
    "cowplot",  # Combine multiple ggplots into one figure
    "sessioninfo", # Report session info
    "rlang",
    "visdat", # Visualise missing data
    "labelled", # Handle variable labels
    "sysfonts", # Custom fonts in plots
    "reactable", # Create interactive tables
    "weights", # Weighted statistics
    "leaflet", # Interactive maps
    "leaflet.extras",  # Interactive maps
    "leaflet.extras2",
    "lubridate", # Work with dates
    "stringr",  # Manipulate strings during preprocessing
    "cowplot"
  ))

Define global settings

options(
  # Remove scientific notation
  scipen = 999,
  width = 150,
  # Clean up dplyr messages
  dplyr.summarise.inform = FALSE)

# Set up theme for plots
sysfonts::font_add_google("Inter")
showtext::showtext_auto()

theme_gmh <- 
  ggplot2::theme_minimal(base_family = "Inter", base_size = 12) +
  ggplot2::theme(
    text = element_text(family = "Inter", colour = "#051520"),
    axis.text.y  = element_text(color = "#051520"),
    axis.text.x  = element_text(
      color = "#051520",
      margin = margin(t = 1),
      face = "bold"
    ),
    axis.title.x = element_text(color = "#051520", face = "bold"),
    axis.title.y = element_text(color = "#051520", face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(colour = "#051520", linewidth = 0.4),
    plot.margin = margin(6, 6, 6, 6),
    plot.subtitle = ggplot2::element_text(color = "#051520"),
    plot.background = ggplot2::element_rect(fill = "transparent", color = NA),
    panel.background = ggplot2::element_rect(fill = "transparent", color = NA),
    legend.background = ggplot2::element_rect(fill = "transparent", color = NA)
  )

ggplot2::theme_set(theme_gmh)

# Print variables in a tidy way
table_label <- function(col) {
  # extract what is after $ in dataframe$column
  name <- sub(".*\\$(.+)", "\\1", deparse(substitute(col)))
  # extract the label of the given column
  lab  <- attr(col, "label")
  # print header wih column name and label
  cat(sprintf("$%s\n%s\n", name, lab))
  # print table output with NA counts
  tbl <- table(col, useNA = "always")
  names(dimnames(tbl)) <- NULL
  print(tbl)
  # print the class of the column
  cat("Class:", paste(class(col), collapse = ", "), "\n")
}

# Print a pretty table
print_reactable <- function(data, sorted_col, width) {
  reactable::reactable(
    data,
    pagination = FALSE,
    height = 650,
    width = width,
    defaultSorted = sorted_col,
    defaultSortOrder = "asc",
    searchable = TRUE,
    striped = TRUE,
    compact = TRUE,
    highlight = TRUE,
    defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
    defaultColDef = reactable::colDef(
      vAlign = "center",
      headerVAlign = "bottom",
      class = "cell",
      headerClass = "header"
    )
  )
}

# Print pretty summaries
print_summ <- function(model, design, var, term) {
  format_p <- function(p) {
    if (p < 0.001) {
      return("< .001")
    }
    base::format(base::round(p, 3), nsmall = 3)
  }

  term_test <- survey::regTermTest(model, term)

  svy_resid <-
    update(design, .resid = stats::residuals(model, type = "response"))

  var_y <-
    survey::svyvar(stats::as.formula(paste0("~", var)), design = svy_resid)[1]

  var_e <- survey::svyvar(~.resid, design = svy_resid)[1]

  r2 <- 1 - (var_e / var_y)
  cohens_f <- base::sqrt(r2 / (1 - r2))
  percent_var_explained <- r2 * 100

  tibble::tibble(
    Ward_F = 
      base::format(base::round(base::as.numeric(term_test$Ftest[1]), 2), nsmall = 2),
    df1 = term_test$df,
    df2 = term_test$ddf,
    p = format_p(term_test$p),
    r2 = base::format(base::round(r2, 4), nsmall = 4),
    cohens_f = base::format(base::round(cohens_f, 4), nsmall = 4),
    percent_var_explained =
      base::format(base::round(percent_var_explained, 4), nsmall = 4)
    )
}

# Calculate weighted correlation
weighted_corr <- function(dat, var_x, var_y, multiple = FALSE) {
  if (!isTRUE(multiple)) {
  
  var_x <- rlang::ensym(var_x)
  var_y <- rlang::ensym(var_y)
  
  design <- survey::svydesign(
    ids = ~ 1,
    weights = ~ ps_weight,
    data = dat
  )
  
  est <- jtools::svycor(
    stats::as.formula(
      paste0("~", rlang::as_name(var_x), " + ", rlang::as_name(var_y))),
    design,
    sig.stats = TRUE,
    bootn = 1000,
    mean1 = TRUE
  )
  
  data.frame(
    r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
    t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
    p = dplyr::if_else(
      est$p.values[2] < 0.01, "<.001", 
      as.character(format(round(est$p.values[2], 3), nsmall = 3)))
  )
  
  } else {
  outcome_sym <- rlang::ensym(var_x)
  items_val <- rlang::eval_tidy(rlang::enquo(var_y))

  design <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = dat)

  results <- purrr::map_dfr(items_val, function(item_name) {
    f <- stats::as.formula(paste0("~", rlang::as_name(outcome_sym), " + ", item_name))
    est <- jtools::svycor(f, design, sig.stats = TRUE, bootn = 1000, mean1 = TRUE)

    r_val <- est$cors[2]
    t_val <- est$t.values[2]
    p_val <- est$p.values[2]
      
  data.frame(
    item = item_name,
    r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
    t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
    p = dplyr::if_else(
      est$p.values[2] < 0.01, "<.001", 
      as.character(format(round(est$p.values[2], 3), nsmall = 3)))
  )
    
  })
  return(results)
  }
}

# Define MPWB items and labels
mpwb_items <- c(
  "mpwb_positive_relationships",
  "mpwb_meaning",
  "mpwb_competence",
  "mpwb_engagement",
  "mpwb_self_esteem",
  "mpwb_optimism",
  "mpwb_positive_emotion",
  "mpwb_emotional_stability",
  "mpwb_resilience",
  "mpwb_vitality"
)

mpwb_labels <- c(
  mpwb_positive_relationships = "Positive relationships",
  mpwb_meaning = "Meaning",
  mpwb_competence = "Competence",
  mpwb_engagement = "Engagement",
  mpwb_self_esteem = "Self-esteem",
  mpwb_optimism = "Optimism",
  mpwb_positive_emotion = "Positive emotion",
  mpwb_emotional_stability = "Emotional stability",
  mpwb_resilience = "Resilience",
  mpwb_vitality = "Vitality"
)

phq4_items <- c("phq_interest", "phq_down", "gad_anxious", "gad_worry")

# Define EU countries
eu_countries <- c(
  "Austria",
  "Belgium",
  "Bulgaria",
  "Croatia",
  "Cyprus",
  "Czech Republic",
  "Denmark",
  "Estonia",
  "Finland",
  "France",
  "Germany",
  "Greece",
  "Hungary",
  "Ireland",
  "Italy",
  "Latvia",
  "Netherlands",
  "Poland",
  "Portugal",
  "Romania",
  "Slovakia",
  "Slovenia",
  "Spain",
  "Sweden"
)

# List of countries whose weight scores were replaced by 1.
flagged_countries <- 
  c("Moldova", "Romania", "Nigeria", "Montenegro", "Angola",
    "Morocco", "Uruguay", "Paraguay", "Greece", "Iran",
    "Hungary", "Kosovo", "Yemen", "Chile", "Uganda")

Load data

The data collection began on June 2, 2025, with a soft-launch phase. The survey’s time zone was set to New York City. Due to time zone differences, some responses show a date of June 1, 2025, even though it was already June 2 in the collaborators’ local time. Some collaborators were residing in countries different from their target country.

# Raw public dataset
df_pub_raw <- base::readRDS("999_public_data.rds")

# View number of rows in the raw dataset
nrow(df_pub_raw)
[1] 68311
# Cleaned dataset
df_gmh <- base::readRDS("999_cleaned_data.rds")

# Create general design
svy <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_gmh)

# View number of rows in the cleaned dataset
nrow(df_gmh)
[1] 53799
# Codebook
codebook <- readxl::read_excel(
  path = "222_codebook.xlsx",
  sheet = "df_cleaned",
  skip = 1,
  col_names = TRUE
)

# TODO 
# Verify that items regarding individual location are being removed in the black box
# Think if duration adjusted should be 21 + n_items_after or 20 + n_items_after
# Add skimr::skim(df_pub) once the data is cleaned
# Add codebook at the end
# Are income_orig_cat_10 and 11 necessary? If we keep then add ordered factor
# Run a major double checking of financial cleaning variables vs- orig
# add the list of countries were we asked "use digits 0-9" to income section.
# Ask how zimbabwe transformation values were obtained
# make sure the tabs "all together" match the responsesids in the other tabs
# clean history from google sheets. 
# make sure that assessment fin does not contain excluded participants.
# use kable to print dataframe added and allow search and so on
# check if any packages is not being used in the load package section
# update max section in the intro
# save all tables and all fig
# run assumption checks for all models
# rename all weighted_n calculated as sums to sum_weight. and calculate effective kirs as weighted_n.

# To correct the anticipated but substantial imbalances in non-representative samples, within-country propensity weighting was applied using national population benchmarks. Expected population proportions for age, gender, and educational attainment (25+) using sex and age estimates from the United States Census Bureau’s International Database 65, educational attainment estimates sourced using the UNESCO data browser 66, some countries required specific equivalent sources (see Appendix A21 for full details) were used to reweight responses using the WeightIt (WeightIt, v1.5.0) framework 67 via gradient boosted trees with stabilized weights  for robustness (see Appendix A21). 

A0. Data Preprocessing

A0.1. Cleaning the dataset

Rename columns

df_pub <- df_pub_raw |>
  dplyr::rename(
    duration_sec = `Duration (in seconds)`,
    
    mpwb_competence = Q5,
    mpwb_emotional_stability = Q7,
    mpwb_engagement = Q9,
    mpwb_meaning = Q11,
    mpwb_optimism = Q13,
    mpwb_positive_emotion = Q15,
    mpwb_positive_relationships = Q17,
    mpwb_resilience = Q19,
    mpwb_self_esteem = Q21,
    mpwb_vitality = Q23,
    
    life_satisfaction = Q29,
    income_orig = Q31,
    income_text_orig = Q31_10_TEXT,
    household_size = Q32,
    birth_year_orig = Q25,
    sex_orig = Q26,
    education_orig = Q27,
    employment_orig = Q28,
    ethnicity_citizenship_orig = Q30,
    assets_orig = Q34,
    debts_orig = Q33,
    bot_check = Q43,
    followup = Q35,
    
    phq_interest = Q36_1,
    phq_down = Q36_2,
    gad_anxious = Q36_3,
    gad_worry = Q36_4,
    
    childhood_SES = Q37,
    fin_outlook = Q38,
    fin_outlook_conf = Q39,
    attention_care = Q40,
    work_arrangement = Q41
  ) |>
  dplyr::relocate(Q_Language, .after = UserLanguage) |>
  
  # Overview of the data
  dplyr::glimpse(width = 100)
Rows: 68,311
Columns: 45
$ StartDate                   <dttm> 2025-06-01 07:14:43, 2025-06-01 07:33:44, 2025-06-01 19:24:40…
$ EndDate                     <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:06, 2025-06-01 19:30:50…
$ Status                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Progress                    <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10…
$ duration_sec                <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, …
$ Finished                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RecordedDate                <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:07, 2025-06-01 19:30:52…
$ ResponseId                  <chr> "R_2i29tTIFUyYilqv", "R_2nemeLi6AnL1uNP", "R_3LqMY0lbugweTSh",…
$ UserLanguage                <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ Q_Language                  <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ mpwb_competence             <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability    <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement             <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning                <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism               <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion       <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem            <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality               <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ life_satisfaction           <dbl> 7, 6, 9, 8, 5, 8, 7, 10, 8, 10, 0, 8, 6, 9, 6, 5, 10, 8, 7, 8,…
$ income_orig                 <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, …
$ income_text_orig            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ household_size              <dbl> 3, 5, 1, 4, 5, 4, 2, 4, 2, 1, 1, 6, 5, 6, 3, 4, 10, 7, 1, 12, …
$ birth_year_orig             <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975"…
$ sex_orig                    <dbl> 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2,…
$ education_orig              <dbl> 6, 6, 5, 8, 6, 3, 7, 5, 7, 6, 5, 8, 5, 7, 7, 6, 5, 6, 5, 4, 7,…
$ employment_orig             <chr> "3", "3", "3", "3", "8", "2,8", "3", "3", "1", "6", "8", "3", …
$ ethnicity_citizenship_orig  <chr> "3,6,10", "1,10", "5,10", "5,10", "3,10", "5,10", "1,10", "1,1…
$ assets_orig                 <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,00…
$ debts_orig                  <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.0…
$ bot_check                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ followup                    <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1,…
$ phq_interest                <dbl> NA, 2, NA, 2, 3, 6, 1, NA, 2, 1, NA, 2, NA, 1, 2, 2, NA, 1, 2,…
$ phq_down                    <dbl> NA, 3, NA, 1, 3, 4, 1, NA, 1, 1, NA, 2, NA, 2, 2, 2, NA, 1, 2,…
$ gad_anxious                 <dbl> NA, 2, NA, 2, 3, 7, 2, NA, 2, 1, NA, 3, NA, 2, 3, 2, NA, 1, 1,…
$ gad_worry                   <dbl> NA, 1, NA, 2, 3, 7, 1, NA, 1, 1, NA, 3, NA, 2, 3, 2, NA, 1, 2,…
$ childhood_SES               <dbl> NA, 4, NA, 4, 2, 4, 1, NA, 1, 4, NA, 2, NA, 4, 4, 3, NA, 3, 4,…
$ fin_outlook                 <dbl> NA, 3, NA, 4, 4, 5, 5, NA, 5, 4, NA, 5, NA, 4, 5, 5, NA, 4, 5,…
$ fin_outlook_conf            <dbl> NA, 10, NA, 8, 8, 10, 10, NA, 10, 8, NA, 8, NA, 9, 7, 8, NA, 1…
$ attention_care              <dbl> NA, 5, NA, 5, 7, 4, 5, NA, 5, 6, NA, 5, NA, 5, 6, 5, NA, 5, 4,…
$ work_arrangement            <dbl> NA, 4, NA, 3, NA, 1, 2, NA, 5, NA, NA, 1, NA, 1, 3, 4, NA, 1, …
$ br                          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ bs                          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ CoreMPWB_DO                 <chr> "Q4|Q23|Q6|Q21|Q8|Q15|Q10|Q19|Q12|Q9|Q14|Q5|Q16|Q7|Q18|Q17|Q20…

Identification of the Countries

# Sanity check: View the counts of language
table_label(df_pub$UserLanguage)
$UserLanguage
User Language
 AM-ARM  AM-ETH  AR-ARE  AR-BHR  AR-DZA  AR-EGY  AR-KWT  AR-LBN  AR-MAR  AR-OMN  AR-QAT  AR-SAU  AR-TCD  AR-YEM  BG-BGR  BN-BGD  BS-BIH CNR-MNE 
    334     303      66     100     203     322     106     416     302     520     503     296       7     577     393     536     642     358 
 CS-CZE  DA-DNK  DE-AUT  DE-CHE  DE-DEU  EL-CYP  EL-GRC      EN  EN-ARE  EN-AUS  EN-BHR  EN-CAN  EN-EGY  EN-EST  EN-ETH  EN-GBR  EN-GEN  EN-GEO 
    267     338     685     452    1008     218     532    5549     270     605     111     535     547       9     249     852      69      54 
 EN-HKG  EN-HUN  EN-IDN  EN-IND  EN-IRL  EN-KOR  EN-KWT  EN-MNG  EN-MYS  EN-NGA  EN-NLD  EN-NOR  EN-PAK  EN-PHL  EN-QAT  EN-SGP  EN-UGA  EN-YEM 
     17       6      12     921     461      11     209      40     203     721     161      26     347    2280      23     298     332       3 
 EN-ZAF  EN-ZMB  EN-ZWE  ES-ARG  ES-BOL  ES-CHL  ES-ECU  ES-ESP  ES-MEX  ES-PER  ES-PRY  ES-URY   ES-US  ET-EST  FA-IRN  FI-FIN FIL-PHL  FR-BEL 
    279      34     169     769     341     240    1075     729    1164    1031     205     815     159    2393     292     275    1276      70 
 FR-CAN  FR-CHE  FR-FRA  FR-MDG  FR-SEN  FR-TCD  HE-ISR  HI-IND  HR-HRV  HU-HUN  ID-IDN  IT-CHE  IT-ITA  JA-JPN  KA-GEO  KK-KAZ  KO-KOR  KY-KGZ 
    339     292    1175     169     211     185     437     706     455     729    1489      79     566     549     450     131     481     166 
 LV-LVA  MK-MKD  MN-MNG  MS-MYS  NL-BEL  NL-NLD  NO-NOR  PL-POL  PT-AGO  PT-BRA  PT-MOZ  PT-PRT  PT-TLS  RO-MDA  RO-ROU  RU-KAZ  RU-KGZ  RU-RUS 
   1023     268     327     613     261     287     483    1288     329    2094     154     579     277     511     861     656     209    1322 
 RU-UZB  SK-SVK  SL-SVN  SN-ZWE SQI-ALB SQI-XKX  SR-SRB  SR-XKX  SV-SWE  TH-THA  TR-TUR  UK-UKR  UR-PAK  UZ-UZB  ZH-CHN  ZH-HKG  ZH-TWN    <NA> 
    119     724     746     106    2284    1371     420       2    1149     440     682     749     160     543    2523     220     201       0 
Class: character 
# Create column with country names mapped from UserLanguage
country_map <- c(
  "SQI-ALB" = "Albania",
  "AR-DZA" = "Algeria",
  "PT-AGO" = "Angola",
  "ES-ARG" = "Argentina",
  "AM-ARM" = "Armenia",
  "EN-AUS" = "Australia",
  "DE-AUT" = "Austria",
  "AR-BHR" = "Bahrain",
  "EN-BHR" = "Bahrain",
  "BN-BGD" = "Bangladesh",
  "FR-BEL" = "Belgium",
  "NL-BEL" = "Belgium",
  "ES-BOL" = "Bolivia",
  "BS-BIH" = "Bosnia-Herzegovina",
  "PT-BRA" = "Brazil",
  "BG-BGR" = "Bulgaria",
  "EN-CAN" = "Canada",
  "FR-CAN" = "Canada",
  "AR-TCD" = "Chad",
  "FR-TCD" = "Chad",
  "ES-CHL" = "Chile",
  "ZH-CHN" = "China",
  "HR-HRV" = "Croatia",
  "EL-CYP" = "Cyprus",
  "CS-CZE" = "Czech Republic",
  "DA-DNK" = "Denmark",
  "ES-ECU" = "Ecuador",
  "AR-EGY" = "Egypt",
  "EN-EGY" = "Egypt",
  "EN-EST" = "Estonia",
  "ET-EST" = "Estonia",
  "AM-ETH" = "Ethiopia",
  "EN-ETH" = "Ethiopia",
  "FR-FRA" = "France",
  "FI-FIN" = "Finland",
  "EN-GEO" = "Georgia",
  "KA-GEO" = "Georgia",
  "DE-DEU" = "Germany",
  "EL-GRC" = "Greece",
  "EN-HKG" = "Hong Kong",
  "ZH-HKG" = "Hong Kong",
  "EN-HUN" = "Hungary",
  "HU-HUN" = "Hungary",
  "EN-IND" = "India",
  "HI-IND" = "India",
  "ID-IDN" = "Indonesia",
  "EN-IDN" = "Indonesia",
  "FA-IRN" = "Iran",
  "EN-IRL" = "Ireland",
  "HE-ISR" = "Israel",
  "IT-ITA" = "Italy",
  "JA-JPN" = "Japan",
  "KK-KAZ" = "Kazakhstan",
  "RU-KAZ" = "Kazakhstan",
  "EN-KOR" = "Republic of Korea",
  "KO-KOR" = "Republic of Korea",
  "SQI-XKX" = "Kosovo",
  "SR-XKX" = "Kosovo",
  "AR-KWT" = "Kuwait",
  "EN-KWT" = "Kuwait",
  "KY-KGZ" = "Kyrgyzstan",
  "RU-KGZ" = "Kyrgyzstan",
  "LV-LVA" = "Latvia",
  "AR-LBN" = "Lebanon",
  "MK-MKD" = "North Macedonia",
  "FR-MDG" = "Madagascar",
  "MS-MYS" = "Malaysia",
  "EN-MYS" = "Malaysia",
  "ES-MEX" = "Mexico",
  "RO-MDA" = "Moldova",
  "EN-MNG" = "Mongolia",
  "MN-MNG" = "Mongolia",
  "CNR-MNE" = "Montenegro",
  "AR-MAR" = "Morocco",
  "PT-MOZ" = "Mozambique",
  "NL-NLD" = "Netherlands",
  "EN-NLD" = "Netherlands",
  "EN-NGA" = "Nigeria",
  "EN-NOR" = "Norway",
  "NO-NOR" = "Norway",
  "AR-OMN" = "Oman",
  "UR-PAK" = "Pakistan",
  "EN-PAK" = "Pakistan",
  "ES-PRY" = "Paraguay",
  "ES-PER" = "Peru",
  "EN-PHL" = "Philippines",
  "FIL-PHL" = "Philippines",
  "PL-POL" = "Poland",
  "PT-PRT" = "Portugal",
  "AR-QAT" = "Qatar",
  "EN-QAT" = "Qatar",
  "RO-ROU" = "Romania",
  "RU-RUS" = "Russia",
  "AR-SAU" = "Saudi Arabia",
  "FR-SEN" = "Senegal",
  "SR-SRB" = "Serbia",
  "EN-SGP" = "Singapore",
  "SK-SVK" = "Slovakia",
  "SL-SVN" = "Slovenia",
  "EN-ZAF" = "South Africa",
  "ES-ESP" = "Spain",
  "SV-SWE" = "Sweden",
  "FR-CHE" = "Switzerland",
  "DE-CHE" = "Switzerland",
  "IT-CHE" = "Switzerland",
  "ZH-TWN" = "Taiwan",
  "TH-THA" = "Thailand",
  "PT-TLS" = "Timor-Leste",
  "TR-TUR" = "Türkiye",
  "EN-UGA" = "Uganda",
  "UK-UKR" = "Ukraine",
  "AR-ARE" = "UAE",
  "EN-ARE" = "UAE",
  "EN-GBR" = "UK",
  "EN" = "USA",
  "ES-US"  = "USA",
  "ES-URY" = "Uruguay",
  "RU-UZB" = "Uzbekistan",
  "UZ-UZB" = "Uzbekistan",
  "AR-YEM" = "Yemen",
  "EN-YEM" = "Yemen",
  "EN-ZMB" = "Zambia",
  "EN-ZWE" = "Zimbabwe",
  "SN-ZWE" = "Zimbabwe",
  "EN-GEN" = "Global"
)

df_pub <- df_pub |>
  dplyr::mutate(
    
    # Identify country based on UserLanguage
    country = country_map[UserLanguage],
    
    # Transform UserLanguage to ISO codes
    # (the last three characters identify the ISO3 code except USA)
    iso3 = stringr::str_extract(UserLanguage, "[A-Z]{3}$"),
    
    # Clean the code for the USA
    iso3 = dplyr::case_when(
      UserLanguage == "EN" ~ "USA",
      UserLanguage == "ES-US" ~ "USA",
      UserLanguage == "EN-GEN" ~ NA_character_,
      TRUE ~ iso3
      ),
    # Convert ISO3 to ISO2
    iso2 = countrycode::countrycode(
      iso3,
      origin = "iso3c",
      destination = "iso2c",
      custom_match = c("XKX" = "XK"))
  ) |>
  dplyr::relocate(country, iso2, iso3, .after = UserLanguage)

# Sanity check: Cross-tab of countries by language
df_pub |>
  dplyr::count(country, iso2, iso3, sort = TRUE) |>
  dplyr::filter(!is.na(country)) |> 
  print_reactable(sorted_col = "country", width = 500)
# Cleanup
rm(country_map)

Global Version Processing

A global version of the survey was created to ensure people from countries that weren’t specifically targeted in this study or whose native languages weren’t provided could still take part. This version didn’t have any changes made for specific countries. There was only an open-text field for the income item, and all financial items asked for values in USD.

# Identify country and citizenship
gen_ident <- 
  readr::read_csv("111_generic_version_country.csv", show_col_types = FALSE) |> 
  dplyr::glimpse(width = 100)
Rows: 69
Columns: 2
$ ResponseId  <chr> "R_4CJBLtS3qvvRTf7", "R_2EGKdy6ce2zvQls", "R_8f1msPTljX0SGpw", "R_7f1bVmdQG7qh…
$ country_gen <chr> "Australia", "Austria", "Austria", "Bangladesh", "Bangladesh", "Bangladesh", "…
nrow(df_pub)
[1] 68311
df_pub <- df_pub |>
  dplyr::left_join(gen_ident, by = "ResponseId") |>
  dplyr::relocate(country_gen, .after = country)

nrow(df_pub)
[1] 68311
# Sanity check: View the country counts of global version participants
# It was not possible to identify the country for one participant
df_pub |> dplyr::filter(UserLanguage == "EN-GEN") |>
  dplyr::group_by(country_gen) |> 
  dplyr::summarise(n = dplyr::n()) |>
  base::print(n = Inf)
# A tibble: 34 × 2
   country_gen                          n
   <chr>                            <int>
 1 Afghanistan                          1
 2 Australia                            1
 3 Austria                              2
 4 Bangladesh                           4
 5 Belgium                              7
 6 Bhutan                               2
 7 Colombia                             3
 8 Democratic Republic of the Congo     1
 9 Dominican Republic                   1
10 Finland                              1
11 France                               3
12 Germany                              2
13 Guatemala                            1
14 Honduras                             1
15 India                                2
16 Italy                                1
17 Kenya                                1
18 Korea                                1
19 Lebanon                              7
20 Namibia                              1
21 Nepal                                1
22 New Zealand                          2
23 Norway                               1
24 Oman                                 5
25 Pakistan                             1
26 Philippines                          2
27 Sri Lanka                            1
28 Sweden                               1
29 Thailand                             1
30 UAE                                  4
31 UK                                   2
32 Zambia                               3
33 Zimbabwe                             1
34 <NA>                                 1
# Cleanup
rm(gen_ident)

Exclusion of Countries with Small Sample Sizes

We excluded the Global version and Zambia because the sample sizes were not sufficiently large. The Global version does not have the country-specific changes that were made in the target countries, consequently those answers can’t be compared. Zambia is not included because it only has 34 participants, which is less than the 120 required.

# View countries with less than 120 participants
df_pub |> 
  dplyr::group_by(country) |> 
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::filter(n < 120) |>
  dplyr::arrange(n)
# A tibble: 2 × 2
  country     n
  <chr>   <int>
1 Zambia     34
2 Global     69
# Exclude Global version and Zambia
nrow(df_pub)
[1] 68311
df_pub <- df_pub |> 
  dplyr::filter(UserLanguage != "EN-GEN", UserLanguage != "EN-ZMB") |>
  dplyr::select(-country_gen)

nrow(df_pub)
[1] 68208

Location Validation

location <-
  readr::read_csv("111_administrative_location.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 68,208
Columns: 7
$ ResponseId   <chr> "R_4rOh5csuvsUlmsF", "R_9Hk3KD5bE28n9bn", "R_8plovBuEUJfYQRO", "R_5miAsDI8Pi7…
$ loc_country  <chr> "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", …
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_admin_1  <chr> "Yerevan", "Yerevan", "Yerevan", "Syunik", "Yerevan", "Yerevan", "Yerevan", "…
$ loc_admin_2  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ lat          <dbl> 40.18720, 40.18720, 40.18720, 39.50899, 40.18720, 40.18720, 40.18720, 40.1872…
$ long         <dbl> 44.51521, 44.51521, 44.51521, 46.34389, 44.51521, 44.51521, 44.51521, 44.5152…
# Merge location validation data into main dataset
nrow(df_pub)
[1] 68208
df_pub <- df_pub |> 
  dplyr::left_join(location, by = "ResponseId") |>
  dplyr::relocate(
    loc_resident,
    loc_country,
    loc_admin_1,
    loc_admin_2,
    lat,
    long,
    .after = Q_Language
  )

# Sanity check: Number of rows should remain the same
nrow(df_pub)
[1] 68208
# Sanity check: How many missing location validations are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(loc_resident)))
[1] 0
# Sanity check: How many missing latitudes are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(lat) & !is.na(loc_country)))
[1] 0
# Sanity check: View the counts of location validation
df_pub |> dplyr::filter(loc_resident == 0) |> 
  dplyr::group_by(country, loc_resident) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity check: View the counts of administrative level units per country
df_pub |> dplyr::filter(loc_resident == 1) |>
  tidyr::pivot_longer(
    cols = c(loc_admin_1, loc_admin_2),
    names_to = "admin_level",
    values_to = "value"
  ) |>
  dplyr::summarise(
    unique_n = dplyr::n_distinct(value, na.rm = TRUE),
    .by = c(country, admin_level)
  ) |>
  print_reactable(sorted_col = "country", width = 500)
# Cleanup
rm(location)

Merge Sponsored Participants from Ireland’s Team

All participants from the Ireland’s sponsored dataset completed the survey and only the mandatory items were included. Some variables had different options than those in the main dataset.

# Merge the sponsored Irish participants
df_irl_raw <-
  readr::read_csv("999_irish_sponsored_public.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 1,200
Columns: 21
$ utcdateandtime                <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", …
$ participantprivateid          <dbl> 13767545, 13767552, 13767549, 13767554, 13767544, 13767547, …
$ branchpbkg                    <chr> "male", "female", "female", "female", "female", "female", "m…
$ qid12object4response          <chr> "Agree", "Strongly Agree", "Strongly Disagree", "Agree", "St…
$ qid13object6response          <chr> "Strongly Agree", "Agree", "Agree", "Disagree", "Agree", "Ag…
$ qid14object8response          <chr> "Strongly Agree", "Strongly Agree", "Strongly Disagree", "Ag…
$ qid15object9response          <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid16object10response         <chr> "Agree", "Strongly Agree", "Agree", "Disagree", "Agree", "Di…
$ qid17object11response         <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid18object12response         <chr> "Agree", "Absolutely Agree", "Strongly Agree", "Strongly Agr…
$ qid19object13response         <chr> "Agree", "Agree", "Agree", "Strongly Agree", "Strongly Agree…
$ qid20object14response         <chr> "Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree…
$ qid20object15response         <chr> "Agree", "Agree", "Neutral", "Disagree", "Agree", "Disagree"…
$ qid29object17response         <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7,…
$ born_locationobject5response  <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__ot…
$ educationobject8response      <chr> "Leaving Certificate", "Degree", "Master's", "Technical or V…
$ employmentobject9response     <chr> "Employed full-time", "Employed full-time", "Seeking Employm…
$ incomeobject12quantised       <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5,…
$ incomeobject12response        <chr> "€67,001 - €85,000", "€67,001 - €85,000", "€85,001 - €105,00…
$ P1ageobject377Response        <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, …
$ numhouseholdobject375Response <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, …
df_irl <- df_irl_raw |>
  dplyr::transmute(
    StartDate_irl = utcdateandtime,
    ResponseId = as.character(participantprivateid),
    sex_irl = branchpbkg,
    mpwb_competence = qid12object4response,
    mpwb_emotional_stability = qid13object6response,
    mpwb_engagement = qid14object8response,
    mpwb_meaning = qid15object9response,
    mpwb_optimism = qid16object10response,
    mpwb_positive_emotion = qid17object11response,
    mpwb_positive_relationships = qid18object12response,
    mpwb_resilience = qid19object13response,
    mpwb_self_esteem = qid20object14response,
    mpwb_vitality = qid20object15response,
    life_satisfaction = qid29object17response,
    ethnicity_citizenship_irl = born_locationobject5response,
    # The education options are slightly different from the version
    # used for Ireland non-sponsored participants
    education_irl = educationobject8response,
    # The employment options are slightly different from the version
    # used for Ireland non-sponsored participants
    employment_irl = employmentobject9response,
    # The income brackets are slightly different from the version
    # used for Ireland non-sponsored participants
    income_irl = incomeobject12quantised,
    household_size = numhouseholdobject375Response,
    age = P1ageobject377Response
  ) |>
  dplyr::mutate(

    sex_orig = dplyr::case_when(
      sex_irl == "male" ~ 1,
      sex_irl == "female" ~ 2,
      sex_irl == "other" ~ 3,
      TRUE ~ NA_integer_
    ),

    ethnicity_citizenship_orig = dplyr::case_when(
        # The only options given were "Ireland" and "__other"
        ethnicity_citizenship_irl == "Ireland" ~ "10",
        ethnicity_citizenship_irl == "__other" ~ "11",
        TRUE ~ NA_character_
      ),
    
    education_orig = dplyr::case_when(
      education_irl == "Less than Junior (Inter) Cert" ~ 1,
      education_irl == "Junior (Inter) Certificate or Equivalent" ~ 2,
      education_irl == "Leaving Certificate" ~ 3,
      education_irl == "Technical or Vocational Certificate" ~ 4,
      education_irl == "Diploma" ~ 5,
      education_irl == "Degree" ~ 6,
      education_irl == "Master's" ~ 7,
      education_irl == "Doctorate" ~ 8,
      TRUE ~ NA_integer_
    ),

    employment_orig = dplyr::case_when(
      employment_irl == "Employed full-time" ~ "3",
      employment_irl == "Employed part-time" ~ "4",
      employment_irl == "Student" ~ "1",
      employment_irl == "Seeking Employment/Unemployed" ~ "8",
      employment_irl == "Homemaker/Carer" ~ "7",
      employment_irl == "Unable to Work" ~ "9",
      employment_irl == "Retired" ~ "6",
      # The option below is not in the original coding scheme
      employment_irl == "Self-employed" ~ NA_character_,
      TRUE ~ NA_character_
    ),

    income_orig = dplyr::if_else(
      # The option 10 = "Prefer not to say" is recoded to NA
      income_irl == 10,
      NA_integer_,
      income_irl
    ),

    Q_Language = "EN-IRL-sponsored",
    UserLanguage = "EN-IRL-sponsored",
    iso3 = "IRL",
    iso2 = "IE",
    country = "Ireland",
    loc_resident = 1,
    loc_country = "Ireland",
    lat = 53.3861632,
    long = -10.5940283,
    irl = 1

  ) |>
  # We need to recode the MPWB items from text to numerical
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(mpwb_items),
      ~ as.numeric(base::factor(
        .,
        levels = c(
          # first level will be coded as 1
          "Absolutely Disagree",
          # second level will be coded as 2, etc.
          "Strongly Disagree",
          "Disagree",
          "Neutral",
          "Agree",
          "Strongly Agree",
          "Absolutely Agree"
          )
  )))) |>
  dplyr::glimpse(width = 100)
Rows: 1,200
Columns: 35
$ StartDate_irl               <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", "0…
$ ResponseId                  <chr> "13767545", "13767552", "13767549", "13767554", "13767544", "1…
$ sex_irl                     <chr> "male", "female", "female", "female", "female", "female", "mal…
$ mpwb_competence             <dbl> 5, 6, 2, 5, 6, 5, 5, 5, 4, 5, 4, 4, 6, 7, 7, 5, 6, 6, 5, 7, 2,…
$ mpwb_emotional_stability    <dbl> 6, 5, 5, 3, 5, 5, 6, 5, 3, 5, 4, 5, 5, 6, 7, 5, 5, 7, 6, 5, 2,…
$ mpwb_engagement             <dbl> 6, 6, 2, 5, 5, 5, 6, 3, 5, 5, 5, 5, 5, 7, 7, 6, 5, 4, 5, 4, 2,…
$ mpwb_meaning                <dbl> 6, 7, 5, 5, 5, 4, 5, 5, 4, 5, 2, 5, 5, 7, 7, 5, 7, 7, 5, 7, 3,…
$ mpwb_optimism               <dbl> 5, 6, 5, 3, 5, 3, 6, 5, 4, 5, 3, 5, 5, 7, 7, 5, 7, 7, 6, 7, 3,…
$ mpwb_positive_emotion       <dbl> 6, 7, 5, 5, 5, 4, 6, 5, 4, 5, 3, 5, 5, 5, 6, 5, 6, 7, 7, 5, 3,…
$ mpwb_positive_relationships <dbl> 5, 7, 6, 6, 5, 5, 7, 5, 5, 6, 3, 6, 5, 4, 6, 6, 5, 6, 6, 5, 2,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 6, 4, 5, 5, 5, 4, 5, 4, 5, 5, 5, 6, 6, 7, 6, 6, 3,…
$ mpwb_self_esteem            <dbl> 5, 6, 5, 5, 6, 4, 5, 5, 4, 5, 3, 5, 5, 1, 4, 5, 6, 7, 6, 5, 2,…
$ mpwb_vitality               <dbl> 5, 5, 4, 3, 5, 3, 5, 5, 4, 5, 3, 4, 4, 1, 4, 5, 6, 7, 4, 4, 2,…
$ life_satisfaction           <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7, 2…
$ ethnicity_citizenship_irl   <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__othe…
$ education_irl               <chr> "Leaving Certificate", "Degree", "Master's", "Technical or Voc…
$ employment_irl              <chr> "Employed full-time", "Employed full-time", "Seeking Employmen…
$ income_irl                  <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ household_size              <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, 3,…
$ age                         <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, 34…
$ sex_orig                    <dbl> 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2,…
$ ethnicity_citizenship_orig  <chr> "10", "10", "10", "10", "10", "11", "10", "10", "10", "10", "1…
$ education_orig              <dbl> 3, 6, 7, 4, 6, 6, 7, 4, 5, 6, 7, 3, 7, 7, 7, 6, 3, 7, 6, 3, 6,…
$ employment_orig             <chr> "3", "3", "8", "3", "3", "7", "6", "3", "7", "3", "7", "4", "3…
$ income_orig                 <dbl> 6, 6, 7, 3, 7, 2, 5, 6, NA, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ Q_Language                  <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ UserLanguage                <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ iso3                        <chr> "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL",…
$ iso2                        <chr> "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "I…
$ country                     <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ loc_resident                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_country                 <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ lat                         <dbl> 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53…
$ long                        <dbl> -10.59403, -10.59403, -10.59403, -10.59403, -10.59403, -10.594…
$ irl                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Sanity check: View the counts of each option
base::table(df_irl$mpwb_competence, df_irl_raw$qid12object4response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  18     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                22    0
  3                   0                   0     0       59       0              0                 0    0
  4                   0                   0     0        0     262              0                 0    0
  5                   0                   0   498        0       0              0                 0    0
  6                   0                   0     0        0       0            229                 0    0
  7                 112                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_emotional_stability, df_irl_raw$qid13object6response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  20     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                39    0
  3                   0                   0     0      148       0              0                 0    0
  4                   0                   0     0        0     231              0                 0    0
  5                   0                   0   490        0       0              0                 0    0
  6                   0                   0     0        0       0            192                 0    0
  7                  80                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_engagement, df_irl_raw$qid14object8response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  11     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                18    0
  3                   0                   0     0      111       0              0                 0    0
  4                   0                   0     0        0     362              0                 0    0
  5                   0                   0   480        0       0              0                 0    0
  6                   0                   0     0        0       0            157                 0    0
  7                  61                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_meaning, df_irl_raw$qid15object9response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  24     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                26    0
  3                   0                   0     0       82       0              0                 0    0
  4                   0                   0     0        0     264              0                 0    0
  5                   0                   0   479        0       0              0                 0    0
  6                   0                   0     0        0       0            213                 0    0
  7                 112                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_optimism, df_irl_raw$qid17object11response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  21     1        8       3              2                 4    0
  2                   0                   2     0       15      13              1                 8    0
  3                   0                   0    21       36      45              2                 4    0
  4                   4                   2   111       20     144             17                 2    0
  5                  15                   0   302        7      47             51                 1    0
  6                  26                   0    52        0       9             95                 0    0
  7                  66                   0    20        0       2             20                 1    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_positive_emotion, df_irl_raw$qid15object9response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   1                  15     1        2       0              1                 5    0
  2                   0                   0     5        2       5              2                 6    0
  3                   0                   7    17       22      30              3                 7    0
  4                   4                   1    85       40     112             14                 7    0
  5                  20                   1   293       14      99             79                 1    0
  6                  25                   0    64        2      15             82                 0    0
  7                  62                   0    14        0       3             32                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_positive_relationships, df_irl_raw$qid18object12response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  26     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                22    0
  3                   0                   0     0       84       0              0                 0    0
  4                   0                   0     0        0     179              0                 0    0
  5                   0                   0   498        0       0              0                 0    0
  6                   0                   0     0        0       0            235                 0    0
  7                 156                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_resilience, df_irl_raw$qid19object13response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  23     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                34    0
  3                   0                   0     0      145       0              0                 0    0
  4                   0                   0     0        0     248              0                 0    0
  5                   0                   0   504        0       0              0                 0    0
  6                   0                   0     0        0       0            159                 0    0
  7                  87                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_self_esteem, df_irl_raw$qid20object14response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  39     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                34    0
  3                   0                   0     0      110       0              0                 0    0
  4                   0                   0     0        0     264              0                 0    0
  5                   0                   0   460        0       0              0                 0    0
  6                   0                   0     0        0       0            176                 0    0
  7                 117                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_vitality, df_irl_raw$qid20object15response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  55     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                87    0
  3                   0                   0     0      247       0              0                 0    0
  4                   0                   0     0        0     325              0                 0    0
  5                   0                   0   330        0       0              0                 0    0
  6                   0                   0     0        0       0            108                 0    0
  7                  48                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
# For the rows that are not in the Irish dataset
df_pub$irl <- 0

# Merge both datasets
df_merged <- dplyr::bind_rows(df_pub, df_irl) |>
  dplyr::relocate(StartDate_irl, .after = StartDate) |>
  dplyr::relocate(Q_Language, .after = ResponseId) |>
  dplyr::relocate(age, .after = birth_year_orig) |>
  dplyr::relocate(sex_irl, .after = sex_orig) |>
  dplyr::relocate(ethnicity_citizenship_irl, .after = ethnicity_citizenship_orig) |>
  dplyr::relocate(employment_irl, .after = employment_orig) |>
  dplyr::relocate(education_irl, .after = education_orig) |>
  dplyr::relocate(income_irl, .after = income_orig)

# Total sample size before individual exclusion criteria
nrow(df_merged)
[1] 69408
# Sanity check:
# Is the sum of rows of both individual datasets equal to the merged dataset?
(length(df_irl$ResponseId) + length(df_pub$ResponseId)) ==
  length(df_merged$ResponseId)
[1] TRUE
# Extract the labels from df_pub and place them back to df_merged
for (i in intersect(names(df_pub), names(df_merged))) {
    attr(df_merged[[i]], "label") <- attr(df_pub[[i]], "label")
}

# Cleanup
rm(df_irl_raw)

MPWB

# Sanity check: View the counts of each option
for (i in mpwb_items) {
  eval(parse(text = sprintf("table_label(df_pub$%s)", i)))
  cat("\n")
}
$mpwb_positive_relationships
I receive help and support from people I am close to when I need it.
    1     2     3     4     5     6     7  <NA> 
 1751  1955  4594  7323 21636 14217 11715  5017 
Class: numeric 

$mpwb_meaning
I feel what I do in my life is valuable and worthwhile.
    1     2     3     4     5     6     7  <NA> 
 1966  2448  5393  8627 21037 13474 10271  4992 
Class: numeric 

$mpwb_competence
I feel a sense of accomplishment from what I do.
    1     2     3     4     5     6     7  <NA> 
 1907  2582  6319  9177 22141 13119  7974  4989 
Class: numeric 

$mpwb_engagement
I feel absorbed in what I am doing.
    1     2     3     4     5     6     7  <NA> 
 1229  2041  6701 11039 22993 12211  7040  4954 
Class: numeric 

$mpwb_self_esteem
I feel positive about myself.
    1     2     3     4     5     6     7  <NA> 
 2009  2703  6688  9186 21232 12895  8582  4913 
Class: numeric 

$mpwb_optimism
I am optimistic about my future.
    1     2     3     4     5     6     7  <NA> 
 2770  3118  6488 10098 19560 11972  9255  4947 
Class: numeric 

$mpwb_positive_emotion
I feel happy.
    1     2     3     4     5     6     7  <NA> 
 2114  2673  6253 12053 21446 11308  7423  4938 
Class: numeric 

$mpwb_emotional_stability
I feel calm and peaceful.
    1     2     3     4     5     6     7  <NA> 
 2571  3882 10398 11446 19752  9404  5835  4920 
Class: numeric 

$mpwb_resilience
I recover quickly from things that go wrong in my life.
    1     2     3     4     5     6     7  <NA> 
 2385  3916 10520 10552 21264  9366  5205  5000 
Class: numeric 

$mpwb_vitality
I feel full of energy.
    1     2     3     4     5     6     7  <NA> 
 3422  5107 11610 12272 17740  8110  5015  4932 
Class: numeric 
df_merged <- df_merged |>
  dplyr::rowwise() |>
  dplyr::mutate(
    # Identify participants that completed all MPWB items
    mpwb_n = base::sum(!is.na(dplyr::c_across(dplyr::all_of(mpwb_items)))),
    
    # Calculate variance, average and sum score of the MPWB items
    # explicitly to only for participants who answered all MPWB items
    mpwb_mean = dplyr::if_else(
      mpwb_n == 10,
      base::mean(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    ),

    mpwb_var = dplyr::if_else(
      mpwb_n == 10,
      stats::var(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    ),

    mpwb_sum = dplyr::if_else(
      mpwb_n == 10,
      base::sum(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    )
  ) |>
  
  # remove the rowwise computation
  dplyr::ungroup() |>
  
  # organise the variables positions
  dplyr::relocate(mpwb_n:mpwb_sum, .after = mpwb_vitality)
  
# Sanity check: View the new MPWB variables
dplyr::glimpse(df_merged |> dplyr::select(dplyr::starts_with("mpwb_")), width = 100)
Rows: 69,408
Columns: 14
$ mpwb_competence             <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability    <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement             <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning                <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism               <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion       <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem            <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality               <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ mpwb_n                      <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ mpwb_mean                   <dbl> 5.7, 4.1, 4.9, 5.6, 4.7, 4.5, 5.8, 7.0, 5.4, 6.1, 4.3, 5.9, 4.…
$ mpwb_var                    <dbl> 0.4555556, 2.3222222, 0.1000000, 0.7111111, 1.3444444, 2.05555…
$ mpwb_sum                    <dbl> 57, 41, 49, 56, 47, 45, 58, 70, 54, 61, 43, 59, 49, 61, 48, 40…
# Sanity check: Are there missing values in the sum score when mpwb_n is 10?
base::table(df_merged$mpwb_n, is.na(df_merged$mpwb_sum), useNA = "always")
      
       FALSE  TRUE  <NA>
  0        0  3247     0
  1        0   722     0
  2        0   396     0
  3        0   372     0
  4        0   275     0
  5        0   239     0
  6        0   240     0
  7        0   193     0
  8        0   152     0
  9        0   174     0
  10   63398     0     0
  <NA>     0     0     0
# Sanity check: Are there values in the var score when mpwb_n is not 10?
df_merged |> dplyr::filter(mpwb_n != 10 & (!is.na(mpwb_var))) |> base::nrow()
[1] 0
# Cleanup
rm(i)

Completion time

df_merged <- df_merged |>
  dplyr::rowwise() |>
  dplyr::mutate(

    # Count how many items were answered (not NA) after the debts item
    # (all items up to the debts item were forced-response)
    n_items_after = base::sum(!is.na(dplyr::c_across(
      c(
        followup,
        phq_interest,
        phq_down,
        gad_anxious,
        gad_worry,
        childhood_SES,
        fin_outlook,
        fin_outlook_conf,
        attention_care,
        work_arrangement
      )
    ))),

    # Calculate adjusted duration if the mandatory items were completed.
    # Some survey versions have different variables of the same item,
    # but all versions have 20 mandatory items before debts.
    total_items = dplyr::if_else(
      !is.na(debts_orig),
      20 + n_items_after,
      NA_real_),

    duration_adj = dplyr::if_else(
      !is.na(debts_orig),
      duration_sec / total_items,
      NA_real_)

  ) |>
  dplyr::ungroup() |>

  # organise the variables positions
  dplyr::relocate(n_items_after:duration_adj, .after = duration_sec)

# Sanity check: View the new variables
dplyr::glimpse(df_merged |> dplyr::select(duration_sec:duration_adj), width = 100)
Rows: 69,408
Columns: 4
$ duration_sec  <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, 6735, 120, 270…
$ n_items_after <int> 1, 10, 1, 10, 9, 10, 10, 1, 10, 9, 1, 10, 1, 10, 10, 10, 1, 10, 9, 1, 9, 10,…
$ total_items   <dbl> 21, 30, 21, 30, 29, 30, 30, 21, 30, 29, 21, 30, 21, 30, 30, 30, 21, 30, 29, …
$ duration_adj  <dbl> 48.952381, 14.733333, 17.619048, 14.200000, 17.655172, 11.466667, 11.366667,…
# Sanity check: Is there a mismatch between n_items_after and total_items?
base::table(df_merged$n_items_after, df_merged$total_items, useNA = "always")
      
          20    21    24    25    26    27    28    29    30  <NA>
  0     1899     0     0     0     0     0     0     0     0 13381
  1        0 14144     0     0     0     0     0     0     0     0
  4        0     0     6     0     0     0     0     0     0     0
  5        0     0     0    44     0     0     0     0     0     0
  6        0     0     0     0    80     0     0     0     0     0
  7        0     0     0     0     0     1     0     0     0     0
  8        0     0     0     0     0     0  1313     0     0     0
  9        0     0     0     0     0     0     0  9303     0     0
  10       0     0     0     0     0     0     0     0 29237     0
  <NA>     0     0     0     0     0     0     0     0     0     0
# Sanity check: Is there unexpected missing values in total_items?
df_merged |>
  dplyr::summarise(
    all_total_items_missing_when_debts_missing =
      all(is.na(total_items[is.na(debts_orig)])),
    any_total_items_present_when_debts_missing =
      any(!is.na(total_items[is.na(debts_orig)])))
# A tibble: 1 × 2
  all_total_items_missing_when_debts_missing any_total_items_present_when_debts_missing
  <lgl>                                      <lgl>                                     
1 TRUE                                       FALSE                                     
# Sanity check: Is there unexpected missing values in n_items_after?
base::table(df_merged$n_items_after, is.na(df_merged$debts_orig), useNA = "always")
      
       FALSE  TRUE  <NA>
  0     1899 13381     0
  1    14144     0     0
  4        6     0     0
  5       44     0     0
  6       80     0     0
  7        1     0     0
  8     1313     0     0
  9     9303     0     0
  10   29237     0     0
  <NA>     0     0     0
# Sanity check: View the range of duration_adj
df_merged |>
  dplyr::filter(!is.na(debts_orig)) |>
  dplyr::summarise(
    min_duration_adj = min(duration_adj, na.rm = TRUE),
    max_duration_adj = max(duration_adj, na.rm = TRUE)
  )
# A tibble: 1 × 2
  min_duration_adj max_duration_adj
             <dbl>            <dbl>
1              1.7           22309.
# Filter the duration adjusted for plotting
df_sub <- df_merged |>
  filter(duration_adj >= 0, duration_adj <= 25)
# Plot intra-individual variance vs time, faceted by country
ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_var)) +
  ggplot2::geom_point(alpha = 0.25, size = 0.8) +
  ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
  ggplot2::facet_wrap(~ country, scales = "free_y", ncol = 4, nrow = 25) +
  ggplot2::labs(
    x = "Duration adjusted (seconds)",
    y = "Within-person variance across MPWB"
  ) +
  ggplot2::theme(
    strip.text = ggplot2::element_text(size = 9, face = "bold"),
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )

ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_sum)) +
  ggplot2::geom_point(alpha = 0.2, size = 0.8) +
  ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
  ggplot2::facet_wrap(~ country, scales = "free", ncol = 4, nrow = 23) +
  ggplot2::labs(
    x = "Duration adjusted (seconds)",
    y = "MPWB Sum"
  ) +
  ggplot2::theme(
    strip.text = ggplot2::element_text(size = 9, face = "bold"),
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )

# Cleanup
rm(df_sub)

PHQ4

The original PHQ-4 has vague verbal anchors that could limit the comparability of results across languages. For example, the option “Several days” could be interpreted as 2-3 days or as more than 7 days in other languages. Since “One week” is not more than 7 days, we decided to recode it as “Several days”.

Used anchors Original anchors Recoded value
Never (1) Not at all (0) 0
Once or twice (1–2) (2) Several days (1) 1
A few days (3–4) (3) Several days (1) 1
Several days (4) Several days (1) 1
One week (5) Several days (1) 1
More than a week (6) More than half the days (2) 2
Every day / nearly every day(7) Nearly every day (3) 3
# Sanity check: View the counts of each option
for (i in phq4_items) {
  eval(parse(text = sprintf("table_label(df_merged$%s)", i)))
  cat("\n")
}
$phq_interest
Over the last 2 weeks, how often have you been bothered by the following problems? - Little interest or pleasure in doing things
    1     2     3     4     5     6     7  <NA> 
 6413 13900  8172  3943  1230  1948  4378 29424 
Class: numeric 

$phq_down
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling down, depressed or hopeless
    1     2     3     4     5     6     7  <NA> 
 8727 14216  6675  3362  1070  2068  3866 29424 
Class: numeric 

$gad_anxious
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling nervous, anxious or on edge
    1     2     3     4     5     6     7  <NA> 
 5755 13322  7569  4542  1275  2303  5218 29424 
Class: numeric 

$gad_worry
Over the last 2 weeks, how often have you been bothered by the following problems? - Not being able to stop or control worrying
    1     2     3     4     5     6     7  <NA> 
11900 11965  5229  3206  1240  2142  4302 29424 
Class: numeric 
# Function to recode PHQ-4 items.
recode_phq <- function(i) {
  dplyr::case_when(
    i == 1 ~ 0,
    i %in% 2:5 ~ 1,
    i == 6 ~ 2,
    i == 7 ~ 3,
    TRUE ~ NA_real_
  )
}

# Sanity check: Count missing values in PHQ-4 items when gad_worry is not missing
df_merged |>
  dplyr::filter(!is.na(gad_worry)) |>
  dplyr::summarise(
    dplyr::across(dplyr::all_of(phq4_items), ~ base::sum(is.na(.x))),
    n_total = dplyr::n()
  )
# A tibble: 1 × 5
  phq_interest phq_down gad_anxious gad_worry n_total
         <int>    <int>       <int>     <int>   <int>
1            0        0           0         0   39984
# Apply recoding and compute sum scores
# only for participants who answered all PHQ-4 items
# (i.e., not missing in the last PHQ item)
# gad_worry was the last item in the PHQ-4 matrix
df_merged <- df_merged |>
  dplyr::mutate(
    # Calculate the sums for phq2, gad2, and phq4
    # only for participants who answered all PHQ-4 items
    phq2_sum = dplyr::if_else(
      !is.na(gad_worry),
      phq_down + phq_interest,
      NA_real_
    ),

    gad2_sum = dplyr::if_else(
      !is.na(gad_worry),
      gad_worry + gad_anxious,
      NA_real_
    ),

    phq4_sum = phq2_sum + gad2_sum
  ) |>

  dplyr::mutate(
    # Apply the recoding function to the individual PHQ items
    dplyr::across(all_of(phq4_items), recode_phq, .names = "{.col}_rec"),

    # Calculate the sums for recoded phq2, gad2, and phq4
    # only for participants who answered all PHQ-4 items
    phq2_sum_rec = dplyr::if_else(
      !is.na(gad_worry),
      phq_down_rec + phq_interest_rec,
      NA_real_
    ),

    gad2_sum_rec = dplyr::if_else(
      !is.na(gad_worry),
      gad_worry_rec + gad_anxious_rec,
      NA_real_
    ),

    phq4_sum_rec = phq2_sum_rec + gad2_sum_rec,

    # Create a variable with cut-off labels
    phq4_cat = dplyr::case_when(
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 0 & phq4_sum_rec <= 2 ~ "Normal (0–2)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 3 & phq4_sum_rec <= 5 ~ "Mild (3–5)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 6 & phq4_sum_rec <= 8 ~ "Moderate (6–8)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 9 & phq4_sum_rec <= 12 ~ "Severe (9–12)",
      # I expect character values, so NA_character_
      TRUE ~ NA_character_
    ),

    # Create variables for depression and anxiety screening,
    # using the standard cut-off of 3 on the respective subscales
    depression_screen = dplyr::case_when(
      is.na(phq2_sum_rec) ~ NA_real_,
      phq2_sum_rec >= 3 ~ 1,
      TRUE ~ 0
    ),
    
    anxiety_screen = dplyr::case_when(
      is.na(gad2_sum_rec) ~ NA_real_,
      gad2_sum_rec >= 3 ~ 1,
      TRUE ~ 0
    )
  ) |>
  dplyr::relocate(phq2_sum:anxiety_screen, .after = gad_worry)

# Sanity checks (view the new variables)
dplyr::glimpse(
  df_merged |>
    dplyr::filter(!is.na(gad_worry)) |>
    dplyr::select(phq_interest:anxiety_screen),
  width = 100
)
Rows: 39,984
Columns: 17
$ phq_interest      <dbl> 2, 2, 3, 6, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 7, 6, 2, 1, 1, 2, 1, 2, 1, 3, …
$ phq_down          <dbl> 3, 1, 3, 4, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 6, 2, 1, 1, 2, 1, 2, 2, 1, …
$ gad_anxious       <dbl> 2, 2, 3, 7, 2, 2, 1, 3, 2, 3, 2, 1, 1, 2, 3, 7, 2, 1, 1, 2, 1, 1, 1, 2, …
$ gad_worry         <dbl> 1, 2, 3, 7, 1, 1, 1, 3, 2, 3, 2, 1, 2, 1, 1, 3, 2, 1, 1, 2, 1, 1, 2, 1, …
$ phq2_sum          <dbl> 5, 3, 6, 10, 2, 3, 2, 4, 3, 4, 4, 2, 4, 3, 10, 12, 4, 2, 2, 4, 2, 4, 3, …
$ gad2_sum          <dbl> 3, 4, 6, 14, 3, 3, 2, 6, 4, 6, 4, 2, 3, 3, 4, 10, 4, 2, 2, 4, 2, 2, 3, 3…
$ phq4_sum          <dbl> 8, 7, 12, 24, 5, 6, 4, 10, 7, 10, 8, 4, 7, 6, 14, 22, 8, 4, 4, 8, 4, 6, …
$ phq_interest_rec  <dbl> 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 3, 2, 1, 0, 0, 1, 0, 1, 0, 1, …
$ phq_down_rec      <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1, 0, …
$ gad_anxious_rec   <dbl> 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, …
$ gad_worry_rec     <dbl> 0, 1, 1, 3, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
$ phq2_sum_rec      <dbl> 2, 1, 2, 3, 0, 1, 0, 2, 1, 2, 2, 0, 2, 1, 4, 4, 2, 0, 0, 2, 0, 2, 1, 1, …
$ gad2_sum_rec      <dbl> 1, 2, 2, 6, 1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 1, 4, 2, 0, 0, 2, 0, 0, 1, 1, …
$ phq4_sum_rec      <dbl> 3, 3, 4, 9, 1, 2, 0, 4, 3, 4, 4, 0, 3, 2, 5, 8, 4, 0, 0, 4, 0, 2, 2, 2, …
$ phq4_cat          <chr> "Mild (3–5)", "Mild (3–5)", "Mild (3–5)", "Severe (9–12)", "Normal (0–2)…
$ depression_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ anxiety_screen    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: View the range of the recoded variables
base::table(df_merged$phq2_sum_rec, useNA = "always")

    0     1     2     3     4     5     6  <NA> 
 4044  6610 21207  1987  2789   891  2456 29424 
base::table(df_merged$gad2_sum_rec, useNA = "always")

    0     1     2     3     4     5     6  <NA> 
 4500  8337 18360  1861  2634  1130  3162 29424 
base::table(df_merged$phq4_sum_rec, useNA = "always")

    0     1     2     3     4     5     6     7     8     9    10    11    12  <NA> 
 1958  2617  4413  6396 13785  1936  2304  1135  1600   794  1046   496  1504 29424 
# Sanity check: Is there mismatch missing values between the two screenings?
dplyr::count(df_merged, depression_screen, anxiety_screen, name = "n")
# A tibble: 5 × 3
  depression_screen anxiety_screen     n
              <dbl>          <dbl> <int>
1                 0              0 28651
2                 0              1  3210
3                 1              0  2546
4                 1              1  5577
5                NA             NA 29424
# Cleanup
rm(recode_phq, i)

Life Satisfaction

# Item is already numeric and within the scale.
table_label(df_merged$life_satisfaction)
$life_satisfaction
Overall, how satisfied are you with life as a whole these days?
    0     1     2     3     4     5     6     7     8     9    10  <NA> 
 1825   945  1844  3208  4266  6865  9228 12343 11174  5508  6005  6197 
Class: numeric 

Income, Assets, and Debts

The open text field contained a Qualtrics validation that forced participants to answer only with digits [0-9], commas, and periods. However, a small amount of participants managed to enter values beyond this validation (e.g., including percentage signs, letters, or other characters).

For the countries where digits 0-9 are not the default numeric keypad, the translations included instructions requesting that participants use only digits 0-9 (Algeria, Bahrain, Chad, Egypt, Kuwait, Morocco, Oman, Saudi Arabia, UAE, Lebanon, Qatar).

Clean numbers
# Sanity check: View the counts of each option
# Option 10 is "Specify: [open text field]"
table_label(df_merged$income_orig)
$income_orig
Please indicate what your total household income was for 2024 (before taxes). You can select an option or indicate a precise value. If you are retired or live off a pension, please indicate the amount your household received during the year in total payments. - Selected Choice
   0    1    2    3    4    5    6    7    8    9   10 <NA> 
2450 5532 6495 6707 6593 5832 5160 5042 3933 5890 6974 8800 
Class: numeric 
# Sanity check: View the variable types
class(df_merged$income_text_orig)
[1] "character"
class(df_merged$assets_orig)
[1] "character"
class(df_merged$debts_orig)
[1] "character"
# Participants were able to write in a open text field their income, assets, and debts.
head(unique(df_merged$income_text_orig), 20)
 [1] NA         "8000"     "15000"    "7000"     "243000"   "124000"   "12345678" "150000"   "700.000"  "10000"    "50000"    "400"      "2500"    
[14] "1500"     "6000"     "300000"   "3600"     "636000"   "643"      "435"     
head(unique(df_merged$assets_orig), 20)
 [1] "5"         "2"         "20.000"    "1000000"   "5000"      "0,00"      "250000"    "1,000.00"  "50000"     "00"        "0"         "700000"   
[13] "20000000"  "600000"    "70000000"  "1"         "100,000"   "50000000"  "7000000"   "350000000"
head(unique(df_merged$debts_orig), 20)
 [1] "10000000"    "2"           "18000"       "0"           "125000"      "0,00"        "1,000.00"    "20.000,00"   "1500"        "200000000"  
[11] "90000"       "150000"      "200000"      "1300000"     "10,000"      "10000"       "120000"      "66000000"    "100,000,000" "200"        
# View values that end with "," or "."
df_merged |>
  dplyr::filter(grepl("[.,]$", income_text_orig)) |>
  dplyr::select(ResponseId, income_text_orig) |>
  base::nrow();
[1] 0
df_merged |>
  dplyr::filter(grepl("[.,]$", assets_orig)) |>
  dplyr::select(ResponseId, assets_orig);
# A tibble: 18 × 2
   ResponseId        assets_orig 
   <chr>             <chr>       
 1 R_2ilYHj1poprgCX8 1,00,000,   
 2 R_2Iaw1PAzIm22N4f 1,500,000.  
 3 R_7Xai7kgm6ni70up 1000000.    
 4 R_3Ezenl8l5Vbehqq 650000.     
 5 R_8EouiGcGN3SO3RO 200000.     
 6 R_7QEIOPC6sqjQ7jF 300000,     
 7 R_8NwCae5exBdtR98 600,000.    
 8 R_3wBLehUhjYbWTbq 100000.     
 9 R_9d3Tm1Wu2M6gFoh 0,          
10 R_2E6m2ErMvEOb0wx 4000.       
11 R_6elc9peo8vxbMVH 3000.       
12 R_7rYZllIWkzJDJ2X 10.         
13 R_9GHpnvrI5tXbopH 350000.     
14 R_7sbQ258PDYZf45A 2,000000.   
15 R_1Hc7FpY3tW9nsh7 500,000.    
16 R_9dhgf8xvk6Ib8LX 100000.     
17 R_8eOIl90Z2J6iB6k 5,000,000.  
18 R_8CSIx79Mqkq1qaB 600,000,000,
df_merged |>
  dplyr::filter(grepl("[.,]$", debts_orig)) |>
  dplyr::select(ResponseId, debts_orig)
# A tibble: 6 × 2
  ResponseId        debts_orig
  <chr>             <chr>     
1 R_7QEIOPC6sqjQ7jF 23000,    
2 R_1wuhfjwEOnWp9AS 0.        
3 R_16SQZLnjugK3f6p 0.        
4 R_5bW2dvfC8MaUgLB 5,00.     
5 R_8CB0K2YQUfWxGY1 0.        
6 R_5xPMFVkMda7RhuS 18000.    
# Create function to clean numbers
clean_number <- function(i) {
  parse_one <- function(s) {
    # Keep NA as NA
    if (is.na(s))
      return(NA_real_)

    # Remove leading/trailing spaces
    s <- stringr::str_trim(s)

    # first character must be a digit, otherwise NA
    if (!stringr::str_detect(s, "^[0-9]"))
      return(NA_real_)

    # If contains "%" or "x", set to NA
    if (stringr::str_detect(s, "%") || stringr::str_detect(s, "[xX]"))
      return(NA_real_)

    # Handle scientific notation ( if e/E is present)
    if (stringr::str_detect(s, "[eE]")) {
      s_sci <- s |>
        stringr::str_replace_all(",", ".") |>
        stringr::str_replace_all("[^0-9eE+\\-\\.]", "")
      val <- as.numeric(s_sci)
      return(val)

    }

    # Remove non-numeric characters (except "." and ",")
    s <- stringr::str_remove_all(s, "[^0-9,\\.]")

    # Allow "0"
    if (s == "0")
      return(0)

    # Place values of 0.0 / 0.00 / 0,0 / 0,00 / 0,000 as 0
    if (stringr::str_detect(s, "^0[\\.,]0{1,3}$"))
      return(0)

    # Otherwise, anything else starting with 0 and longer than 1 char -> NA
    # For example: "007", "01", "0.7", "0,7", "0.000", "0,000", "0002"
    if (stringr::str_detect(s, "^0") && base::nchar(s) > 1)
      return(NA_real_)

    # Remove "." or "," at the very end
    # For example: "1.000.000." -> "1.000.000"
    s <- stringr::str_replace(s,"[,\\.]$","")

    # Identify last occurrence of "," or "." as decimal separator
    # Some countries use "," as decimal separator and others use "."
    m <- stringr::str_match(s, "([,\\.])([0-9]*)$")

    if (!is.na(m[1])) {
      # Count the number of digits after the last separator
      sep <- m[2]
      digits_after <- m[3]
      len <- base::nchar(digits_after)

      if (len >= 3) {

        # Thousands separator, remove all separators
        # For example: "1.000.000" -> "1000000"
        s <- stringr::str_remove_all(s, "[,.]")

      } else {

        # Decimal, keep only last separator as decimal
        # Remove all other separators
        # For example: "1.000.000.00" -> "1000000.00"
        # "1,000,000,00" -> "1000000,00"
        s_wo_last <- stringr::str_sub(s, 1, nchar(s) - len - 1)
        s_wo_last <- stringr::str_remove_all(s_wo_last, "[,.]")

        # This R session uses "." as decimal separator,
        # so we need to convert accordingly
        # For example: "1000000,00" -> "1000000.00"
        s <- paste0(s_wo_last, ".", digits_after)
      }
    }

    # In R, numerical values have 53 bits of precision (9.0e15),
    # so very large numbers that exceed R's numeric limit will be rounded
    # to the nearest representable double.
    # For example, as.numeric("9999999999999999999") returns 10000000000000002048.
    as.numeric(s)
  }

  vapply(i, parse_one, numeric(1))
}

# Sanity check:
clean_number(c(",1", "0.1", "0,75", "1%", "1000", "1000000,00", "1.000",
               "1,00,000", "1.000.000.00", "1.000.000.", "0010", "10x", "7e-1",
               "9999999999999999999", "0", "0.0", "0,0", "0.00", "0,00", "07",
               "0.7", "0,7", "00", "00,00", "00.00"))
                    ,1                    0.1                   0,75                     1%                   1000             1000000,00 
                    NA                     NA                     NA                     NA                 1000.0              1000000.0 
                 1.000               1,00,000           1.000.000.00             1.000.000.                   0010                    10x 
                1000.0               100000.0              1000000.0              1000000.0                     NA                     NA 
                  7e-1    9999999999999999999                      0                    0.0                    0,0                   0.00 
                   0.7 10000000000000002048.0                    0.0                    0.0                    0.0                    0.0 
                  0,00                     07                    0.7                    0,7                     00                  00,00 
                   0.0                     NA                     NA                     NA                     NA                     NA 
                 00.00 
                    NA 
# Apply function to the values in open text fields
df_merged <- df_merged |>
  dplyr::mutate(
    income_text_clean = clean_number(income_text_orig),
    assets_clean = clean_number(assets_orig),
    debts_clean = clean_number(debts_orig)) |>
  dplyr::relocate(income_text_clean, .after = income_text_orig) |>
  dplyr::relocate(assets_clean, .after = assets_orig) |>
  dplyr::relocate(debts_clean, .after = debts_orig)

# Sanity check: View changes between original and cleaned income text
df_merged |>
  dplyr::mutate(
    income_text_clean = as.character(income_text_clean),
    n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
  ) |>
  dplyr::filter(income_text_clean != income_text_orig) |>
  dplyr::select(
    ResponseId,
    income_text_orig,
    income_text_clean,
    n_digits_orig,
    n_digits_clean
  ) |>
  print_reactable(sorted_col = "income_text_orig", width = 800)
# Sanity check: View changes between original and cleaned assets text
df_merged |>
  dplyr::mutate(
    assets_clean = as.character(assets_clean),
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
  ) |>
  dplyr::filter(assets_clean != assets_orig) |>
  dplyr::select(
    ResponseId,
    assets_orig,
    assets_clean,
    n_digits_orig,
    n_digits_clean
  )  |>
  print_reactable(sorted_col = "assets_orig", width = 800)
# Sanity check: View the new cleaned variables
df_merged |>
  dplyr::select(
    income_orig,
    income_text_orig,
    income_text_clean,
    assets_orig,
    assets_clean,
    debts_orig,
    debts_clean
  ) |>
  dplyr::glimpse(width = 150)
Rows: 69,408
Columns: 7
$ income_orig       <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, 9, 7, 2, 3, 3, 1, 9, 10, 4, 4, 4, 8, 8, 9, 9, 10, 2, 3, 10, …
$ income_text_orig  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "8000", "15000", NA, NA, NA, NA, NA, NA, NA, NA, NA, "7000…
$ income_text_clean <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8000, 15000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7000, NA,…
$ assets_orig       <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,000.00", "50000", "00", "0", "1000000", "0", "700000", "200000…
$ assets_clean      <dbl> 5, 2, 20000, 1000000, 5000, 0, 250000, 1000, 50000, NA, 0, 1000000, 0, 700000, 20000000, 0, 0, 600000, 70000000, 1, 0, 0, …
$ debts_orig        <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.00", "0", "20.000,00", "1500", "0", "0", "0", "200000000", "0…
$ debts_clean       <dbl> 10000000, 2, 18000, 0, 125000, 0, 0, 1000, 0, 20000, 1500, 0, 0, 0, 200000000, 0, 90000, 0, 0, 0, 150000, 0, 200000, 0, 0,…
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
  n_income_orig_text = sum(!is.na(income_text_orig)),
  n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
  n_assets_orig = sum(!is.na(assets_orig)),
  n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
  n_debts_orig = sum(!is.na(debts_orig)),
  n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)
# A tibble: 1 × 6
  n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
               <int>                  <int>         <int>             <int>        <int>            <int>
1               6973                      0         56550               861        56027              909
# Sanity check: View the new cleaned variables
# View values that contain non-numeric characters besides "." and ","
df_merged |>
  dplyr::filter(!stringr::str_detect(income_text_orig, "^[0-9,\\.]+$") &
      !is.na(income_orig)) |> select(income_text_orig, income_text_clean) |>
  base::nrow();
[1] 0
df_merged |>
  dplyr::filter(!stringr::str_detect(assets_orig, "^[0-9,\\.]+$") &
      !is.na(assets_orig)) |> select(assets_orig, assets_clean);
# A tibble: 17 × 2
   assets_orig assets_clean
   <chr>              <dbl>
 1 -0                    NA
 2 +10000                NA
 3 40%                   NA
 4 4.5e7           45000000
 5 1.78e10      17800000000
 6 -0                    NA
 7 +80.000               NA
 8 6.265e9       6265000000
 9 -0                    NA
10 0x0                   NA
11 1.3425e10    13425000000
12 10 %                  NA
13 10%                   NA
14 2%                    NA
15 30%                   NA
16 0%                    NA
17 +1000000              NA
df_merged |>
  dplyr::filter(!stringr::str_detect(debts_orig, "^[0-9,\\.]+$") &
      !is.na(debts_orig)) |> select(debts_orig, debts_clean)
# A tibble: 10 × 2
   debts_orig debts_clean
   <chr>            <dbl>
 1 7%                  NA
 2 50%                 NA
 3 8.95e8       895000000
 4 5%                  NA
 5 , 60000             NA
 6 10%                 NA
 7 10%                 NA
 8 20%                 NA
 9 ,0%                 NA
10 +4000               NA
Add financial country-level values
fin_values <-
  readr::read_csv("111_country_variables.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 126
Columns: 23
$ country                          <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "…
$ language                         <chr> "Albanian", "Arabic", "Portuguese", "Spanish", "Armenian"…
$ UserLanguage                     <chr> "SQI-ALB", "AR-DZA", "PT-AGO", "ES-ARG", "AM-ARM", "EN-AU…
$ income_period                    <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "a…
$ income_type                      <chr> "gross", "gross", "gross", "gross", "gross", "gross", "ne…
$ income_year                      <dbl> 2024, 2025, 2024, 2024, 2024, 2025, 2024, 2024, 2024, 202…
$ income_currency                  <chr> "lek", "د.ج", "Kz", "$ (peso)", "ՀՀ դրամ", "AU$", "€", "د…
$ income_currency_position         <chr> "left", "right", "left", "right", "left", "right", "left"…
$ income_cutoff_min                <dbl> 12000, 10000, 50000, 250000, 15000, 40000, 14508, 200, 20…
$ assets_cutoff_min                <dbl> 1000, 10000, 0, 100, 10000, 500, 0, 1000, 1000, 100, 100,…
$ debts_cutoff_min                 <dbl> 1000, 1000, 10000, 100, 10000, 0, 0, 100, 100, 10, 100, 1…
$ assets_upper_limit               <dbl> 40000001, 100000000, NA, 350000000, 50000000, 30000000, 2…
$ debts_upper_limit                <dbl> 50000001, 300000000, NA, 350000000, 50000000, 3000000, 10…
$ wages_per_year                   <dbl> 12, 12, 13, 13, 12, NA, NA, NA, 12, NA, 13, 13, 13, 12, N…
$ inflation2024_factor             <dbl> NA, 1.0010, NA, NA, NA, 1.0182, NA, NA, NA, NA, NA, NA, N…
$ one_local_unit_to_USD_conversion <dbl> 0.010738447, 0.007728573, 0.001149628, 0.001093261, 0.002…
$ one_USD_to_local_unit_conversion <dbl> 93.123, 129.390, 869.846, 914.695, 392.730, 1.531, 0.924,…
$ country_region                   <chr> "Europe & Central Asia", "Middle East, North Africa, Afgh…
$ continent                        <chr> "Europe", "MENA", "Africa", "South America", "Europe", "O…
$ country_incomegroup              <chr> "Upper middle income", "Upper middle income", "Lower midd…
$ soft_launch                      <chr> "June 2", "June 7", "June 2", "June 2", "June 2", "June 5…
$ target_size                      <dbl> 300, 600, 600, 600, 300, 600, 300, 300, 300, 1200, 600, 6…
$ comment_country                  <chr> NA, "Collaborator said that the household income values a…
# Join financial country-level values to the main dataset
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(dplyr::select(fin_values, -country), by = "UserLanguage")

# Sanity check: Number of rows should remain the same
nrow(df_merged)
[1] 69408
Create categorical variables
# Add categorical variable
df_merged <- df_merged |>
  dplyr::mutate(

    # Considers all options
    income_orig_cat_11 =
      dplyr::case_when(
        income_orig == 0 ~ "No income",
        income_orig == 1 ~ "Second decile",
        income_orig == 2 ~ "Third decile",
        income_orig == 3 ~ "Fourth decile",
        income_orig == 4 ~ "Fifth decile",
        income_orig == 5 ~ "Sixth decile",
        income_orig == 6 ~ "Seventh decile",
        income_orig == 7 ~  "Eighth decile",
        income_orig == 8 ~ "Ninth decile",
        income_orig == 9 ~ "Tenth decile",
        income_orig == 10 ~ "Specify",
        TRUE ~ NA_character_
     ),

    # Only considers the first 10 options and gives NA to "Specify"
    income_orig_cat_10 =
      dplyr::case_when(
        income_orig == 0 ~ "No income",
        income_orig == 1 ~ "Second decile",
        income_orig == 2 ~ "Third decile",
        income_orig == 3 ~ "Fourth decile",
        income_orig == 4 ~ "Fifth decile",
        income_orig == 5 ~ "Sixth decile",
        income_orig == 6 ~ "Seventh decile",
        income_orig == 7 ~ "Eighth decile",
        income_orig == 8 ~ "Ninth decile",
        income_orig == 9 ~ "Tenth decile",
        TRUE ~ NA_character_
     )
    ) |>
  dplyr::relocate(income_orig_cat_11, income_orig_cat_10, .after = income_orig)

# Sanity check: View the mapping distribution of the new income variables
df_merged |> dplyr::count(income_orig, income_orig_cat_11)
# A tibble: 12 × 3
   income_orig income_orig_cat_11     n
         <dbl> <chr>              <int>
 1           0 No income           2450
 2           1 Second decile       5532
 3           2 Third decile        6495
 4           3 Fourth decile       6707
 5           4 Fifth decile        6593
 6           5 Sixth decile        5832
 7           6 Seventh decile      5160
 8           7 Eighth decile       5042
 9           8 Ninth decile        3933
10           9 Tenth decile        5890
11          10 Specify             6974
12          NA <NA>                8800
df_merged |> dplyr::count(income_orig, income_orig_cat_10)
# A tibble: 12 × 3
   income_orig income_orig_cat_10     n
         <dbl> <chr>              <int>
 1           0 No income           2450
 2           1 Second decile       5532
 3           2 Third decile        6495
 4           3 Fourth decile       6707
 5           4 Fifth decile        6593
 6           5 Sixth decile        5832
 7           6 Seventh decile      5160
 8           7 Eighth decile       5042
 9           8 Ninth decile        3933
10           9 Tenth decile        5890
11          10 <NA>                6974
12          NA <NA>                8800
Add income bracket information

Country-specific adjustments were applied for an efficient mapping. For example, due to the phrasing, some countries had overlapping values in the brackets: if the last bracket was “more than 4500” and 4500 was the same as the low point of the previous bracket.

# Load the income bracket information and apply country-specific adjustments.
income_recoded <- base::readRDS("111_income_recoded.rds") |>
  dplyr::mutate(
    income_lowpoint =
      dplyr::case_when(
        # Correct Mongolia's income bracket error. Where it reads
        # "₮1,700,001 – ₮2,000,00" should be "₮1,700,001 – ₮2,000,000".
        # Any reasonable person would be able to spot that,
        # if they even noticed it.
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 1700001,
        
        # Qatar 5th bracket: AR-QAT: [150000-250000]; EN-QAT: [150001-250000]
        UserLanguage == "AR-QAT" & income_orig == 5 ~ 150001,
        
        # Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
        # per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 1500,

        TRUE ~ income_lowpoint
    ),

    income_highpoint =
      dplyr::case_when(
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 2000000,

        # Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
        # per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 2500,

        # Correct Uzbekistan's income brackets so the highpoint of each decile
        # matches the lowpoint of the next decile (e.g., coding 14.9 mln as 14999999
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 3 ~ 4999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 4 ~ 9999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 5 ~ 14999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 6 ~ 19999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 7 ~ 24999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 8 ~ 29999999,

        # Georgia 2nd bracket: KA-GEO 0-500; EN-GEO 0-550.
        UserLanguage == "KA-GEO" & income_orig == 1 ~ 550,
        
        # Kyrgyzstan 8th bracket: KY-KGZ [100000-119000]; RU-KGZ [100000-119999]. 
        UserLanguage == "KY-KGZ" & income_orig == 8 ~ 119999,
        
        # Ar-TCD 3rd bracket overlaps with the 2nd bracket and do not match with
        # FR-TCD's 3rd bracket.
        UserLanguage == "AR-TCD" & income_orig == 3 ~ 3000000,

        TRUE ~ income_highpoint
    ),
  )

# Sanity check: Any country have the different brackets across languages?
# We expect to only have differences between Ireland's sponsored and main versions.
income_recoded |>
  dplyr::group_by(country, income_orig) |>
  dplyr::summarise(
    n_lang = dplyr::n_distinct(UserLanguage),
    n_brackets = dplyr::n_distinct(
      paste(income_lowpoint, income_highpoint)
    ),
    bracket_defs = paste0(
      UserLanguage, ": [", income_lowpoint, "-", income_highpoint, "]",
      collapse = "; "
    ),
    .groups = "drop"
  ) |> dplyr::filter(n_lang > 1, n_brackets > 1)
# A tibble: 9 × 5
  country income_orig n_lang n_brackets bracket_defs                                              
  <chr>         <int>  <int>      <int> <chr>                                                     
1 Ireland           1      2          2 EN-IRL: [0-17500]; EN-IRL-sponsored: [0-22000]            
2 Ireland           2      2          2 EN-IRL: [17500-24999]; EN-IRL-sponsored: [22001-32000]    
3 Ireland           3      2          2 EN-IRL: [25000-34999]; EN-IRL-sponsored: [32001-42000]    
4 Ireland           4      2          2 EN-IRL: [35000-49999]; EN-IRL-sponsored: [42001-55000]    
5 Ireland           5      2          2 EN-IRL: [50000-74999]; EN-IRL-sponsored: [55001-67000]    
6 Ireland           6      2          2 EN-IRL: [75000-99999]; EN-IRL-sponsored: [67001-85000]    
7 Ireland           7      2          2 EN-IRL: [100000-149999]; EN-IRL-sponsored: [85001-105000] 
8 Ireland           8      2          2 EN-IRL: [150000-200000]; EN-IRL-sponsored: [105001-137000]
9 Ireland           9      2          2 EN-IRL: [200000-NA]; EN-IRL-sponsored: [137000-NA]        
# Correct gaps between brackets
income_gaps <- income_recoded |>
  dplyr::group_by(UserLanguage) |>
  dplyr::arrange(income_orig, .by_group = TRUE) |>
  
  # First check lowpoints
  dplyr::mutate(
    prev_high = dplyr::lag(income_highpoint),
    expected_low = prev_high + 1,
    has_gap = income_orig >= 2 &
      income_orig <= 8 &
      !is.na(prev_high) &
      !is.na(income_lowpoint) &
      income_lowpoint != expected_low,
    income_lowpoint_adj = dplyr::if_else(
      has_gap,
      expected_low,
      income_lowpoint
    ),
      
    # Then highpoints
    next_low = dplyr::lead(income_lowpoint_adj),
    expected_high = next_low - 1L,
    high_needs_fix =
      income_orig >= 2 &
      income_orig <= 8 &
      !is.na(next_low) &
      !is.na(income_highpoint) &
      income_highpoint != expected_high,
    income_highpoint_adj = dplyr::if_else(
      high_needs_fix,
      expected_high,
      income_highpoint
    )
  ) |>
  dplyr::ungroup()

# Sanity check: View languages where there is a gap
income_gaps |>
  dplyr::filter(has_gap) |>
  dplyr::select(
    UserLanguage,
    income_orig,
    prev_high,
    income_lowpoint,
    expected_low
  )  |>
  print_reactable(sorted_col = "UserLanguage", width = 800)
# Transform income_recoded into a wider format for merging
income_info <- income_gaps |>
  dplyr::select(UserLanguage, income_orig, 
                income_lowpoint, income_lowpoint_adj, 
                income_highpoint, income_highpoint_adj) |>
  tidyr::pivot_longer(
    cols = c(income_lowpoint, income_lowpoint_adj, 
             income_highpoint, income_highpoint_adj),
    names_to = "bound",
    values_to = "value"
  ) |>
  tidyr::pivot_wider(
    names_from = c(bound, income_orig),
    values_from = value,
    names_sep = "_"
  ) |> dplyr::glimpse(width = 100)
Rows: 125
Columns: 37
$ UserLanguage           <chr> "AM-ARM", "AM-ETH", "AR-ARE", "AR-BHR", "AR-DZA", "AR-EGY", "AR-KWT…
$ income_lowpoint_1      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_lowpoint_adj_1  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_highpoint_1     <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_highpoint_adj_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_lowpoint_2      <dbl> 24001, 601, 60000, 301, 15000, 70001, 500, 18000000, 1500, 500, 500…
$ income_lowpoint_adj_2  <dbl> 24001, 601, 60001, 301, 15001, 70001, 501, 18000001, 1501, 500, 500…
$ income_highpoint_2     <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_highpoint_adj_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_lowpoint_3      <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000000, 2500, 1000…
$ income_lowpoint_adj_3  <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000001, 2501, 1000…
$ income_highpoint_3     <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_highpoint_adj_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_lowpoint_4      <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000000, 4000, 150…
$ income_lowpoint_adj_4  <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000001, 4001, 150…
$ income_highpoint_4     <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_highpoint_adj_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_lowpoint_5      <dbl> 192000, 2401, 240000, 1201, 50000, 600001, 2000, 90000000, 6000, 20…
$ income_lowpoint_adj_5  <dbl> 192001, 2401, 240000, 1201, 50000, 600001, 2000, 90000001, 6001, 20…
$ income_highpoint_5     <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_highpoint_adj_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_lowpoint_6      <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000000, 8000, 2…
$ income_lowpoint_adj_6  <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000001, 8001, 2…
$ income_highpoint_6     <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_highpoint_adj_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_lowpoint_7      <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000000, 10000…
$ income_lowpoint_adj_7  <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000001, 10001…
$ income_highpoint_7     <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_highpoint_adj_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_lowpoint_8      <dbl> 960000, 10001, 500000, 2001, 150000, 2400001, 5000, 200000000, 1250…
$ income_lowpoint_adj_8  <dbl> 960001, 10001, 500000, 2001, 150000, 2400001, 5000, 200000001, 1250…
$ income_highpoint_8     <dbl> 1200000, 20000, 699999, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_adj_8 <dbl> 1200000, 19999, 699999, 2299, 199999, 4799999, 5999, 299999999, 149…
$ income_lowpoint_9      <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_lowpoint_adj_9  <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ income_highpoint_adj_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(income_info, by = "UserLanguage") |>
  dplyr::relocate(income_lowpoint_1:income_highpoint_adj_9,
    .after = income_orig_cat_10
  )

# Sanity check
nrow(df_merged)
[1] 69408
Identify strange numbers in income, assets, and debts
# Create function to identify strange numbers.
weird_nr <- function(i) {

  # Temporary transform into a character vector so we can use stringr functions
  s <- as.character(i)

  # Flag numbers with the same non-zero digit repeated >=4 (e.g., 1111, 9999)
  # except for zeros.
  rep4 <-
    stringr::str_detect(s, "(?:1111|2222|3333|4444|5555|6666|7777|8888|9999)")

  # Flag sequential numbers of length >= 3 ascending or descending
  # (e.g., 123, 1234, 4321)
  asc3  <- stringr::str_detect(s, "(?:123|234|345|456|567|678|789)")
  desc3 <- stringr::str_detect(s, "(?:321|432|543|654|765|876|987)")

  # Flag repeated 2-digit blocks (e.g., 3939, 1212, 4545)
  repeat2 <- stringr::str_detect(s, "(?!0{2})(\\d{2})\\1+")

  # Combine all flags and check if any is TRUE
  outcome <- (rep4 | asc3 | desc3 | repeat2)

  # Make NAs as not weird
  outcome[is.na(outcome)] <- FALSE

  outcome
}

# Sanity check:
weird_nr(c(999999, 12340, 43210, 3939, 540000, 75000, NA))
[1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
# Apply function to financial variables
df_merged <- df_merged |>
  dplyr::mutate(
    income_wrd = weird_nr(income_text_clean) |
      # Also detect rows where original text exists but cleaning is NA
      (!is.na(income_text_orig) & is.na(income_text_clean)),

    assets_wrd = weird_nr(assets_clean) |
      (!is.na(assets_orig) & is.na(assets_clean)),

    debts_wrd  = weird_nr(debts_clean) |
      (!is.na(debts_orig) & is.na(debts_clean))
  ) |>
  relocate(income_wrd, .after = income_text_clean) |>
  relocate(assets_wrd, .after = assets_clean) |>
  relocate(debts_wrd, .after = debts_clean)

# Sanity check: View the counts of weird numbers per variable
base::table(df_merged$income_wrd, useNA = "always")

FALSE  TRUE  <NA> 
69353    55     0 
base::table(df_merged$assets_wrd, useNA = "always")

FALSE  TRUE  <NA> 
68386  1022     0 
base::table(df_merged$debts_wrd, useNA = "always")

FALSE  TRUE  <NA> 
68411   997     0 
# Sanity check: View changes between original and cleaned income text
df_merged |>
  dplyr::mutate(
    income_text_clean = as.character(income_text_clean),
    n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
  ) |>
  dplyr::filter(
    income_text_clean != income_text_orig | 
      (!is.na(income_text_orig) & is.na(income_text_clean))) |>
  dplyr::group_by(
    income_text_orig,
    income_text_clean,
    income_wrd,
    n_digits_orig,
    n_digits_clean
  ) |> 
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "income_text_orig", width = 800)
# Sanity check: View changes between original and cleaned assets text
df_merged |>
  dplyr::mutate(
    assets_clean = as.character(assets_clean),
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
  ) |>
  dplyr::filter(
    assets_clean != assets_orig | (!is.na(assets_orig) & is.na(assets_clean))) |>
  dplyr::group_by(
    assets_orig,
    assets_clean,
    assets_wrd,
    n_digits_orig,
    n_digits_clean
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_orig", width = 800)
# Sanity check: View changes between original and cleaned debts text
df_merged |>
  dplyr::mutate(
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(debts_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(debts_clean, "[0-9]")
  ) |>
  dplyr::filter(
    debts_clean != debts_orig | (!is.na(debts_orig) & is.na(debts_clean)
    )) |>
  dplyr::group_by(
    debts_orig,
    debts_clean,
    debts_wrd,
    n_digits_orig,
    n_digits_clean
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "debts_orig", width = 500)
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
  n_income_orig_text = sum(!is.na(income_text_orig)),
  n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
  n_assets_orig = sum(!is.na(assets_orig)),
  n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
  n_debts_orig = sum(!is.na(debts_orig)),
  n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)
# A tibble: 1 × 6
  n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
               <int>                  <int>         <int>             <int>        <int>            <int>
1               6973                      0         56550               861        56027              909
# Sanity check: View the rows with NA in cleaned values
# but original text exists
df_merged |> dplyr::group_by(income_text_orig, income_text_clean, income_wrd) |>
  dplyr::filter(!is.na(income_text_orig) & is.na(income_text_clean)) |>
  dplyr::summarise(n = dplyr::n()) |> base::nrow()
[1] 0
df_merged |> dplyr::group_by(assets_orig, assets_clean, assets_wrd) |>
  dplyr::filter(!is.na(assets_orig) & is.na(assets_clean)) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_orig", width = 500)
df_merged |> dplyr::group_by(debts_orig, debts_clean, debts_wrd) |>
  dplyr::filter(!is.na(debts_orig) & is.na(debts_clean)) |>
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "debts_orig", width = 500)
# Cleanup
# We will need income_recoded, income_gaps, and income_info later
rm(fin_values, clean_number, weird_nr)

Household Size

# Sanity check: View the options of this item
table_label(df_merged$household_size)
$household_size
How many people in your household are covered by these finances? Put 1 if you live alone, or if you live with others (e.g., roommates) but are financially independent from them and vice-versa, put 1. Otherwise, list the total number of people living with you that are part of household finances (both incomes and expenses).
    1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19    20  <NA> 
15591 15309 10311  9886  4984  2109   948   596   243   312    71    73    31    29    48    21     8    16    17   115  8690 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    household_size_group = factor(
      dplyr::case_when(
        household_size == 1 ~ "1",
        household_size == 2 ~ "2",
        household_size == 3 ~ "3",
        household_size %in% c(4, 5) ~ "4-5",
        household_size >= 6 ~ "6-20",
        TRUE ~ NA_character_
      ),
      levels = c("1", "2", "3", "4-5", "6-20"),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(household_size_group, .after = household_size)

# Sanity check: View the mapping distribution of the new household size variable
base::table(df_merged$household_size, df_merged$household_size_group, useNA = "always")
      
           1     2     3   4-5  6-20  <NA>
  1    15591     0     0     0     0     0
  2        0 15309     0     0     0     0
  3        0     0 10311     0     0     0
  4        0     0     0  9886     0     0
  5        0     0     0  4984     0     0
  6        0     0     0     0  2109     0
  7        0     0     0     0   948     0
  8        0     0     0     0   596     0
  9        0     0     0     0   243     0
  10       0     0     0     0   312     0
  11       0     0     0     0    71     0
  12       0     0     0     0    73     0
  13       0     0     0     0    31     0
  14       0     0     0     0    29     0
  15       0     0     0     0    48     0
  16       0     0     0     0    21     0
  17       0     0     0     0     8     0
  18       0     0     0     0    16     0
  19       0     0     0     0    17     0
  20       0     0     0     0   115     0
  <NA>     0     0     0     0     0  8690

Birth Year and Age

Participants were able to write in an open text field their birth year, and the validation required values between 1925 and 2007, except for Iran (FA-IRN), where the validation ranged from 1304 to 1386.

# Sanity check: Class of the variable
class(df_merged$birth_year_orig)
[1] "character"
class(df_merged$age)
[1] "numeric"
# Sanity check: View values with non-numeric characters
df_merged |>
  dplyr::filter(!is.na(birth_year_orig) & grepl("\\D", birth_year_orig)) |>
  dplyr::select(birth_year_orig) |> 
  dplyr::distinct() |> 
  base::print(n = Inf)
# A tibble: 41 × 1
   birth_year_orig
   <chr>          
 1 2001.          
 2 1993.          
 3 2000.          
 4 2003.          
 5 2002.          
 6 1995.          
 7 2005.          
 8 1997.          
 9 1969.          
10 1977.          
11 2005,          
12 1985.          
13 1982.          
14 1983.          
15 2004.          
16 1996.          
17 1980.          
18 1994.          
19 2007.          
20 1992.          
21 2006.          
22 1975.          
23 1972.          
24 1978.          
25 1999.          
26 1990.          
27 1989.          
28 1945.          
29 1981.          
30 1971.          
31 1974.          
32 1955.          
33 1949.          
34 1959.          
35 1988.          
36 ,1979          
37 1982.0424      
38 ,1953          
39 1973.01        
40 1963.          
41 1951.          
# Create cleaned column and keep original.
# Calculate age.
df_merged <- df_merged |>
  dplyr::mutate(
    # extract first 4-digit sequence and transform to numerical
    birth_year_clean = 
      as.numeric(stringr::str_extract(birth_year_orig, "\\d{4}")),

    age = dplyr::case_when(
      
      # Keep the values of the participants from the Irish sponsored dataset
      !is.na(age) ~ age,
      
      # If rows in birth year contains NA, then keep NA
      is.na(birth_year_clean) & is.na(age) ~ NA_real_,
      
      # If Q_Language is "FA-IRN",
      # then use the Solar Hijri calendar (1404)
      UserLanguage == "FA-IRN" & !is.na(birth_year_clean) ~ 1404 - birth_year_clean,
      
      # Otherwise, use the Gregorian calendar (2025)
      !is.na(birth_year_clean) ~ 2025 - birth_year_clean,
      
      TRUE ~ NA_real_
    ),
    
    # Create age groups
    age_group = base::factor(dplyr::case_when(
      age >= 18 & age <= 25 ~ "18-25",
      age >= 26 & age <= 44 ~ "26-44",
      age >= 45 & age <= 64 ~ "45-64",
      age >= 65 & age <= 74 ~ "65-74",
      age >= 75 ~ "75+",
      TRUE ~ NA_character_
    ),
    levels = c(
      "18-25",
      "26-44",
      "45-64",
      "65-74",
      "75+"
    ))
  ) |>
  dplyr::relocate(birth_year_clean:age_group, .after = birth_year_orig)


# Sanity check: View the summary of the cleaned birth year
cat(
  "Min: ",
  min(df_merged$birth_year_clean, na.rm = TRUE),
  "\nMax: ",
  max(df_merged$birth_year_clean, na.rm = TRUE),
  "\nNA count: ",
  sum(is.na(df_merged$birth_year_clean)),
  "\nClass: ",
  class(df_merged$birth_year_clean)
)
Min:  1328 
Max:  2007 
NA count:  10380 
Class:  numeric
# Sanity check: Are there rows where raw birth year exists but cleaning failed?
df_merged |> 
  dplyr::filter(!is.na(birth_year_orig) & is.na(birth_year_clean)) |>
  base::nrow()
[1] 0
# Sanity check: View the summary of the age variable
cat(
  "Min: ",
  min(df_merged$age, na.rm = TRUE),
  "\nMax: ",
  max(df_merged$age, na.rm = TRUE),
  "\nNA count: ",
  sum(is.na(df_merged$age)),
  "\nClass: ",
  class(df_merged$age)
)
Min:  18 
Max:  100 
NA count:  9180 
Class:  numeric
# Sanity check: View the mapping distribution of the new age group variable
base::table(df_merged$age_group, useNA = "ifany")

18-25 26-44 45-64 65-74   75+  <NA> 
13674 30944 13098  1988   524  9180 
# Sanity check: Are there rows where raw value exists but age group is missing?
df_merged |> 
  dplyr::filter(!is.na(birth_year_orig) & is.na(age_group)) |>
  base::nrow()
[1] 0
# Sanity check: View the new birth year and age variables
dplyr::glimpse(df_merged |>
                 dplyr::select(birth_year_orig,
                               birth_year_clean,
                               age,
                               age_group),
               width = 100)
Rows: 69,408
Columns: 4
$ birth_year_orig  <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975", "1995", "…
$ birth_year_clean <dbl> 1989, 1984, 1971, 1986, 1993, 2005, 1986, 1975, 1995, 1963, 1993, 1981, 2…
$ age              <dbl> 36, 41, 54, 39, 32, 20, 39, 50, 30, 62, 32, 44, 24, 35, 31, 24, 31, 39, 5…
$ age_group        <fct> 26-44, 26-44, 45-64, 26-44, 26-44, 18-25, 26-44, 45-64, 26-44, 45-64, 26-…
# Sanity check: View counts of the sponsored Irish dataset
df_merged |>
  dplyr::filter(irl==1) |>
  dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 67 × 6
# Groups:   UserLanguage, birth_year_orig, birth_year_clean, age [67]
   UserLanguage     birth_year_orig birth_year_clean   age age_group     n
   <chr>            <chr>                      <dbl> <dbl> <fct>     <int>
 1 EN-IRL-sponsored <NA>                          NA    18 18-25         7
 2 EN-IRL-sponsored <NA>                          NA    19 18-25         4
 3 EN-IRL-sponsored <NA>                          NA    20 18-25        12
 4 EN-IRL-sponsored <NA>                          NA    21 18-25         9
 5 EN-IRL-sponsored <NA>                          NA    22 18-25         5
 6 EN-IRL-sponsored <NA>                          NA    23 18-25         9
 7 EN-IRL-sponsored <NA>                          NA    24 18-25        10
 8 EN-IRL-sponsored <NA>                          NA    25 18-25        17
 9 EN-IRL-sponsored <NA>                          NA    26 26-44        11
10 EN-IRL-sponsored <NA>                          NA    27 26-44        16
# ℹ 57 more rows
# Sanity check: View counts of Iran dataset
df_merged |>
  dplyr::filter(UserLanguage == "FA-IRN") |>
  dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(-birth_year_clean)  |>
  print_reactable(sorted_col = "birth_year_clean", width = 800)
# Sanity check: View counts of main dataset
df_merged |>
  dplyr::filter(irl == 0 & UserLanguage != "FA-IRN") |>
  dplyr::group_by(birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(-birth_year_clean) |>
  print_reactable(sorted_col = "birth_year_clean", width = 800)

Sex

Upon collaborators’ request, the option “I prefer to use: [open text field]” was hidden from the survey versions in Kuwait (AR-KWT; EN-KWT), Egypt (AR-EGY; EN-EGY), Yemen (AR-YEM; EN-YEM), in Algeria (AR-DZA), in Saudi Arabia (AR-SAU), Chad (AR-TCD; FR-TCD), and Bahrain (AR-BHR; EN-BHR).

# Sanity check: View the counts of each option
table_label(df_merged$sex_orig)
$sex_orig
Which best describes you? - Selected Choice
    1     2     3  <NA> 
23444 36194   549  9221 
Class: numeric 
# Load recoded values regarding sex because
# some participants wrote "Female" or "Male" in the open text field
sex_recoded <- 
  readr::read_csv("111_sex_open_answers_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 544
Columns: 2
$ ResponseId       <chr> "R_42tedcZhWdJn9Sk", "R_9hnp095IY8LIkSX", "R_2gvEljyDLXs1Yjm", "R_516n1yU…
$ sex_text_recoded <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot deter…
base::table(sex_recoded$sex_text_recoded, useNA = "always")

Cannot determine           Female             Male       Non-binary            Other             <NA> 
             137               12               17              344               34                0 
df_merged <- df_merged |>
  dplyr::left_join(sex_recoded, by = "ResponseId") |>

  # create a reviewed numeric coding (1 = Male, 2 = Female, 3 = Other)
  dplyr::mutate(
    sex_reviewed = dplyr::case_when(
      sex_text_recoded == "Female" ~ 2,
      sex_text_recoded == "Male" ~ 1,
      sex_text_recoded %in% c("Other", "Non-binary") ~ 3,
      sex_text_recoded == "Cannot determine" ~ NA_real_,
      TRUE ~ sex_orig
    ),

    # categorical factor with explicit levels
    sex_reviewed_cat = factor(
      dplyr::case_when(
        sex_reviewed == 1 ~ "Male",
        sex_reviewed == 2 ~ "Female",
        sex_reviewed == 3 ~ "Other",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female", "Other")
    ),

    # binary numeric: 1 = Male, 0 = Female, NA otherwise
    sex_binary = dplyr::case_when(
      sex_reviewed == 1 ~ 1,
      sex_reviewed == 2 ~ 0,
      TRUE ~ NA_real_
      ),

    # binary factor
    sex_binary_cat = factor(
      dplyr::case_when(
        sex_binary == 1 ~ "Male",
        sex_binary == 0 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    )
  ) |>

  dplyr::relocate(sex_text_recoded:sex_binary_cat, .after = sex_orig)

# Sanity check: Cross-tabs to inspect recoded text vs numeric reviewed code
df_merged |>
  dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
  dplyr::summarise(n = dplyr::n(), .groups = "drop")
# A tibble: 4 × 5
  sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat     n
         <dbl> <fct>                 <dbl> <fct>          <int>
1            1 Male                      1 Male           23461
2            2 Female                    0 Female         36206
3            3 Other                    NA <NA>             383
4           NA <NA>                     NA <NA>            9358
# Sanity check: Cross-tabs to inspect original values vs numeric reviewed code
table(df_merged$sex_orig, df_merged$sex_reviewed, useNA = "always")
      
           1     2     3  <NA>
  1    23444     0     0     0
  2        0 36194     0     0
  3       17    12   383   137
  <NA>     0     0     0  9221
# Sanity check: View the counts of each option
df_merged |>
  dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 4 × 5
# Groups:   sex_reviewed, sex_reviewed_cat, sex_binary [4]
  sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat     n
         <dbl> <fct>                 <dbl> <fct>          <int>
1            1 Male                      1 Male           23461
2            2 Female                    0 Female         36206
3            3 Other                    NA <NA>             383
4           NA <NA>                     NA <NA>            9358
# Cleanup
rm(sex_recoded)

Education Level

The translated education categories of each country will be mapped to a common set of categories. Some countries had a different definition of secondary education, so the mapping will consider if the level is eligible for university entrance or not. The classification of the education levels in each country was agreed upon with the collaborators. The recoded education categories are:

  • Less than secondary (not eligible for university entrance)
  • Secondary (completed the equivalent to high school, and it is eligible for university entrance)
  • Technical (not higher education)
  • University (higher education up to a bachelor’s degree)
  • Advanced (anything beyond a bachelor’s degree)

Note:

  • The team from Ethiopia (AM-ETH and EN-ETH) requested to hide option 7 from their versions of the survey.
  • The team from Peru requested to include an option for “Inclusive Education”. Since this applies across several levels, this option was recoded to NA.
# Sanity check: View the counts of each option
table_label(df_merged$education_orig)
$education_orig
Which is the highest level of education you have completed?
    1     2     3     4     5     6     7     8  <NA> 
  328  1084  8515  8206 19982 13762  4459  3773  9299 
Class: numeric 
# Load the education categories for each country
edu_cat <- 
  readr::read_csv("111_education_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 1,024
Columns: 5
$ UserLanguage          <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM"…
$ education_orig        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,…
$ education_cat         <chr> "No primary education", "Primary (Elementary/Middle School)", "High …
$ education_recoded_cat <chr> "Less than secondary", "Less than secondary", "Secondary", "Technica…
$ education_recoded     <dbl> 1, 1, 2, 3, 4, 4, 5, 5, 1, 1, 2, 3, 4, 5, NA, 5, 1, 1, 2, 3, 4, 5, 5…
# Sanity check: View if there are unexpected values in education_orig
base::table(edu_cat$education_orig, useNA = "always")

   1    2    3    4    5    6    7    8 <NA> 
 128  128  128  128  128  128  128  128    0 
# Sanity check: View if the categories match the expected values
# We expect three cells with missing values regarding 
# Peru's inclusive education level, and level 7 was hidden for Ethiopia (AM-ETH
# and EN-ETH).
edu_cat |>
  dplyr::group_by(education_recoded_cat, education_recoded) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(education_recoded)
# A tibble: 6 × 3
# Groups:   education_recoded_cat [6]
  education_recoded_cat education_recoded     n
  <chr>                             <dbl> <int>
1 Less than secondary                   1   275
2 Secondary                             2   138
3 Technical                             3   119
4 University                            4   129
5 Advanced                              5   360
6 <NA>                                 NA     3
edu_cat |> 
  dplyr::filter(is.na(education_recoded))
# A tibble: 3 × 5
  UserLanguage education_orig education_cat       education_recoded_cat education_recoded
  <chr>                 <dbl> <chr>               <chr>                             <dbl>
1 AM-ETH                    7 <NA>                <NA>                                 NA
2 EN-ETH                    7 <NA>                <NA>                                 NA
3 ES-PER                    5 Inclusive education <NA>                                 NA
# Add the education categories to the main data frame
df_merged <- df_merged |>
  dplyr::left_join(
    edu_cat |> dplyr::select(
      UserLanguage,
      education_orig,
      education_cat,
      education_recoded_cat,
      education_recoded
    ),
    by = c("UserLanguage", "education_orig")
  ) |>
  dplyr::mutate(
    education_recoded_cat = base::factor(
      education_recoded_cat,
      levels = c(
        "Less than secondary",
        "Secondary",
        "Technical",
        "University",
        "Advanced"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(education_cat:education_recoded, .after = education_orig)

# Sanity check: Are there education values without a corresponding
# education_recoded and education_recoded_cat?
df_merged |>
  group_by(
    UserLanguage,
    education_orig,
    education_cat,
    education_recoded_cat,
    education_recoded
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::filter(is.na(education_recoded) & !is.na(education_orig))
# A tibble: 1 × 6
# Groups:   UserLanguage, education_orig, education_cat, education_recoded_cat [1]
  UserLanguage education_orig education_cat       education_recoded_cat education_recoded     n
  <chr>                 <dbl> <chr>               <ord>                             <dbl> <int>
1 ES-PER                    5 Inclusive education <NA>                                 NA    31
# Sanity check: Check Irish sponsored dataset
df_merged |>
  filter(irl == 1) |>
  group_by(
    education_irl,
    education_orig,
    education_cat,
    education_recoded_cat,
    education_recoded) |> dplyr::summarise(n = dplyr::n())
# A tibble: 8 × 6
# Groups:   education_irl, education_orig, education_cat, education_recoded_cat [8]
  education_irl                            education_orig education_cat                            education_recoded_cat education_recoded     n
  <chr>                                             <dbl> <chr>                                    <ord>                             <dbl> <int>
1 Degree                                                6 Degree                                   University                            4   355
2 Diploma                                               5 Diploma                                  Technical                             3   185
3 Doctorate                                             8 Doctorate                                Advanced                              5    11
4 Junior (Inter) Certificate or Equivalent              2 Junior (Inter) Certificate or Equivalent Less than secondary                   1    67
5 Leaving Certificate                                   3 Leaving Certificate                      Secondary                             2   277
6 Less than Junior (Inter) Cert                         1 Less than Junior (Inter) Cert            Less than secondary                   1    13
7 Master's                                              7 Master's                                 Advanced                              5   154
8 Technical or Vocational Certificate                   4 Technical or Vocational Certificate      Technical                             3   138
# Cleanup
rm(edu_cat)

Employment Status

Upon collaborators’ request, the option “Part-time student” was hidden from the versions KA-GEO and EN-GEO in Georgia and SR-SRB in Serbia. The option “Military service” was hidden from the version JA-JPN in Japan.

During the survey completion, participants were not allowed to select conflicting options:

  • Employed full-time and part-time simultaneously.
  • Student full-time and part-time simultaneously.
  • Employed/working full-time or part-time and not in paid employment simultaneously.
  • Military service and not in paid employment simultaneously.
  • Military service and retired simultaneously.
  • Retired and not in paid employment simultaneously.
  • Not in paid employment by choice and looking for work or unable to work due to health/personal reasons simultaneously.
  • Looking for work and unable to work due to health/personal reasons simultaneously.

Employment status was recoded using a sequential rule:

  • Military if the military service option was selected.
  • Employed/working full-time (25+ hours per week) if the full-time employment option was selected.
  • Employed/working part-time (less than 25 hours per week) if the part-time employment option was selected.
  • Not in paid employment (looking for work) if the job-seeking option was selected and no conditions above were met.
  • Student non-working (Full or part-time) if the full- or part-time student was selected and no conditions above were met.
  • Not in paid employment (by choice/health) if not working by choice or for health reasons and no conditions above were met.
  • Retired if the retired option was selected and no conditions above were met.
# Sanity check: View the counts of each option
table_label(df_merged$employment_orig)
$employment_orig
Which most accurately describes you at this moment? You may select up to two options in case you fit more than one category.
    1   1,3   1,4   1,5   1,6   1,7   1,8   1,9     2   2,3   2,4   2,5   2,6   2,7   2,8   2,9     3   3,5   3,6     4   4,5   4,6     5     6     7 
 6511   962   984    70    18   328   671   114  1312  1440   628    34    28   113   267    54 30950   217   189  4737    47   182   589  2592  1839 
    8     9  <NA> 
 3233  1728  9571 
Class: character 
# Replace numeric values with descriptive labels
employment_labels <- c(
  "1" = "Full-time student",
  "2" = "Part-time student",
  "3" = "Employed/working full-time (25+ hours per week)",
  "4" = "Employed/working part-time (less than 25 hours per week)",
  "5" = "Military service",
  "6" = "Retired",
  "7" = "Not in paid employment (by choice)",
  "8" = "Not in paid employment (looking for work)",
  "9" = "Not in paid employment (unable to work due to health/personal reasons)")

# Function to recode multiple-choice values
recode_employment <- function(i) {
  # If row is NA, return NA
  if (is.na(i)) return(NA_character_)
  # Split the string by comma and map to labels
  codes <- strsplit(i, ",")[[1]]
  # Collapse the labels into a single string
  paste(employment_labels[trimws(codes)], collapse = "; ")
}

df_merged <- df_merged |>
  dplyr::mutate(

    # Apply recoding function to create employment_cat variable
    # so instead of "2,5", we have "Part-time student; Military service"
    employment_cat =
           stringr::str_squish(sapply(employment_orig, recode_employment)),

    employment_primary = base::factor(
      dplyr::case_when(

        # Contains option 5
        stringr::str_detect(employment_orig, fixed("5"))
        ~ "Military service",

        # Contains option 3 AND do not contain option 5
        stringr::str_detect(employment_orig, fixed("3")) &
          !(stringr::str_detect(employment_orig, fixed("5")))
        ~ "Employed/working full-time (25+ hours per week)",

        # Contains option 4 AND do not contain option 5
        # (it was not possible to select options 3 and 4 simultaneously)
        stringr::str_detect(employment_orig, fixed("4"))  &
          !(stringr::str_detect(employment_orig, fixed("5")))
        ~ "Employed/working part-time (less than 25 hours per week)",

        # Contains option 8 AND do not contain option 5
        # (it was not possible to select options 8 and 5, 3 or 4 simultaneously)
        stringr::str_detect(employment_orig, fixed("8"))
        ~ "Not in paid employment (looking for work)",

        # Contains option 1 or 2 AND do not contain option 5, 3, 4, or 8
        (stringr::str_detect(employment_orig, fixed("1")) |
        stringr::str_detect(employment_orig, fixed("2"))) &
        !(stringr::str_detect(employment_orig, fixed("5"))) &
        !(stringr::str_detect(employment_orig, fixed("3"))) &
        !(stringr::str_detect(employment_orig, fixed("4"))) &
        !(stringr::str_detect(employment_orig, fixed("8")))
        ~ "Student non-working (Full or part-time)",

        # Contains option 7 or 9 AND do not contain option 1, or 2
        # (it was not possible to select options 7 or 9
        # and 8, 5, 3 or 4 simultaneously)
        (stringr::str_detect(employment_orig, fixed("7")) |
          stringr::str_detect(employment_orig, fixed("9"))) &
        !(stringr::str_detect(employment_orig, fixed("1"))) &
        !(stringr::str_detect(employment_orig, fixed("2")))
        ~ "Not in paid employment (by choice/health)",

        # Contains option 6 AND do not contain option 5, 3, 4, 8, 1, 2, 7 or 9
        # (it was not possible to select options 6 and 7, 8, 9, 5 simultaneously)
        stringr::str_detect(employment_orig, fixed("6")) &
        !(stringr::str_detect(employment_orig, fixed("3"))) &
        !(stringr::str_detect(employment_orig, fixed("4"))) &
        !(stringr::str_detect(employment_orig, fixed("1"))) &
        !(stringr::str_detect(employment_orig, fixed("2")))
        ~ "Retired",

        TRUE ~ NA_character_
      ),

      levels = c(
        "Not in paid employment (by choice/health)",
        "Not in paid employment (looking for work)",
        "Student non-working (Full or part-time)",
        "Employed/working full-time (25+ hours per week)",
        "Employed/working part-time (less than 25 hours per week)",
        "Retired",
        "Military service"
      )
    )
  ) |>
  dplyr::relocate(employment_cat:employment_primary, .after = employment_orig)

# Sanity check: How many options were selected per participant?
df_merged |>
  dplyr::mutate(number_of_options_selected =
           if_else(is.na(employment_orig),
                   NA_integer_,
                   str_count(employment_orig, ",") + 1)) |>
  count(number_of_options_selected)
# A tibble: 3 × 2
  number_of_options_selected     n
                       <dbl> <int>
1                          1 53491
2                          2  6346
3                         NA  9571
# Sanity check: View the distribution of primary employment
base::table(df_merged$employment_primary, useNA = "ifany")

               Not in paid employment (by choice/health)                Not in paid employment (looking for work) 
                                                    3567                                                     4171 
                 Student non-working (Full or part-time)          Employed/working full-time (25+ hours per week) 
                                                    8478                                                    33541 
Employed/working part-time (less than 25 hours per week)                                                  Retired 
                                                    6531                                                     2592 
                                        Military service                                                     <NA> 
                                                     957                                                     9571 
# Sanity check: Cross-tab between primary employment and original employment
print(table(df_merged$employment_primary,
            df_merged$employment_orig, useNA = "ifany"), n = Inf)
                                                          
                                                               1   1,3   1,4   1,5   1,6   1,7   1,8   1,9     2   2,3   2,4   2,5   2,6   2,7   2,8
  Not in paid employment (by choice/health)                    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
  Not in paid employment (looking for work)                    0     0     0     0     0     0   671     0     0     0     0     0     0     0   267
  Student non-working (Full or part-time)                   6511     0     0     0    18   328     0   114  1312     0     0     0    28   113     0
  Employed/working full-time (25+ hours per week)              0   962     0     0     0     0     0     0     0  1440     0     0     0     0     0
  Employed/working part-time (less than 25 hours per week)     0     0   984     0     0     0     0     0     0     0   628     0     0     0     0
  Retired                                                      0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
  Military service                                             0     0     0    70     0     0     0     0     0     0     0    34     0     0     0
  <NA>                                                         0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
                                                          
                                                             2,9     3   3,5   3,6     4   4,5   4,6     5     6     7     8     9  <NA>
  Not in paid employment (by choice/health)                    0     0     0     0     0     0     0     0     0  1839     0  1728     0
  Not in paid employment (looking for work)                    0     0     0     0     0     0     0     0     0     0  3233     0     0
  Student non-working (Full or part-time)                     54     0     0     0     0     0     0     0     0     0     0     0     0
  Employed/working full-time (25+ hours per week)              0 30950     0   189     0     0     0     0     0     0     0     0     0
  Employed/working part-time (less than 25 hours per week)     0     0     0     0  4737     0   182     0     0     0     0     0     0
  Retired                                                      0     0     0     0     0     0     0     0  2592     0     0     0     0
  Military service                                             0     0   217     0     0    47     0   589     0     0     0     0     0
  <NA>                                                         0     0     0     0     0     0     0     0     0     0     0     0  9571
# Sanity check: View the counts of each option
df_merged |>
  dplyr::group_by(employment_orig, employment_cat, employment_primary) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_orig", width = 800)
# Cleanup
rm(recode_employment, employment_labels)

Citizenship and Ethnicity

This item allowed participants to select multiple choices. The first eight options referred to ethnicity. Only some countries contained the options related to ethnicity.

The last three options referred to citizenship status. Participants were not allowed to select Citizen of [country] and Resident of [country] (non-citizen) simultaneously. All countries contained the citizenship options.

Citizenship
# Sanity check: View the counts of each option
table_label(df_merged$ethnicity_citizenship_orig)
$ethnicity_citizenship_orig
Please choose which best describes you. You must select at least one option from the top part and at least one option from the bottom. - Selected Choice
              1,10               1,11             1,2,10             1,2,11           1,2,3,10         1,2,3,4,10       1,2,3,4,5,10 
             24538                444                321                  6                 11                  4                  1 
    1,2,3,4,5,6,10 1,2,3,4,5,6,7,8,11     1,2,3,4,6,7,10       1,2,3,4,7,10   1,2,3,4,7,8,9,10         1,2,3,5,10       1,2,3,5,8,10 
                 1                  1                  1                  1                  1                  2                  1 
      1,2,3,5,9,10         1,2,3,8,10         1,2,3,9,10           1,2,4,10           1,2,4,11         1,2,4,5,10       1,2,4,6,7,10 
                 1                  2                  1                115                  3                 10                  1 
      1,2,4,6,8,10         1,2,4,8,10         1,2,4,8,11         1,2,4,9,10           1,2,5,10           1,2,6,10           1,2,6,11 
                 1                 11                  1                  5                  5                  2                  1 
        1,2,6,8,10           1,2,7,10         1,2,7,8,10           1,2,8,10           1,2,8,11              1,2,9           1,2,9,10 
                 1                  1                  1                 19                  2                 14                 16 
          1,2,9,11             1,3,10             1,3,11           1,3,4,10         1,3,4,5,10         1,3,4,6,10           1,3,5,10 
                 1                164                  2                  8                  1                  1                  3 
           1,3,5,9           1,3,6,10           1,3,8,10           1,3,8,11         1,3,8,9,10         1,3,8,9,11              1,3,9 
                 1                  1                 11                  1                  2                  1                  2 
          1,3,9,10             1,4,10             1,4,11           1,4,5,10           1,4,5,11         1,4,5,8,10            1,4,5,9 
                 5                134                  5                 14                  1                  3                  1 
        1,4,5,9,10           1,4,6,10         1,4,6,8,10           1,4,7,10         1,4,7,8,10           1,4,8,10         1,4,8,9,10 
                 1                  1                  1                  3                  1                  9                  2 
             1,4,9           1,4,9,10           1,4,9,11             1,5,10             1,5,11         1,5,6,8,10         1,5,6,9,10 
                 3                  8                  1                104                  4                  1                  1 
          1,5,7,10           1,5,8,10              1,5,9           1,5,9,10           1,5,9,11             1,6,10             1,6,11 
                 2                 10                  4                  3                  2                 72                  1 
          1,6,7,10           1,6,8,10            1,6,8,9           1,6,9,10           1,6,9,11             1,7,10             1,7,11 
                 1                  5                  1                  3                  1                 46                  1 
          1,7,8,10         1,7,8,9,10              1,7,9           1,7,9,10           1,7,9,11             1,8,10             1,8,11 
                 1                  2                  2                  2                  1                584                 30 
             1,8,9           1,8,9,10           1,8,9,11                1,9             1,9,10             1,9,11                 10 
                28                 17                  2                664                368                 95              10450 
                11               2,10               2,11             2,3,10             2,3,11     2,3,4,5,6,9,10       2,3,4,5,7,10 
               415               2307                222                 14                  3                  1                  1 
         2,3,4,5,9         2,3,4,8,11         2,3,4,9,11           2,3,5,11         2,3,5,9,11         2,3,6,8,10           2,3,7,10 
                 1                  1                  1                  3                  1                  1                  2 
          2,3,8,10         2,3,8,9,11              2,3,9           2,3,9,11             2,4,10             2,4,11         2,4,5,6,10 
                 2                  1                  3                  2                 80                  8                  1 
      2,4,5,6,7,10         2,4,5,7,10           2,4,6,10           2,4,7,10           2,4,8,10              2,4,9           2,4,9,10 
                 1                  1                  2                  1                  2                  2                  2 
            2,5,10             2,5,11           2,5,7,10           2,5,9,11             2,6,10           2,6,7,10           2,6,8,10 
                12                  3                  1                  4                 10                  1                  1 
             2,6,9           2,6,9,11             2,7,10         2,7,8,9,10              2,7,9           2,7,9,10             2,8,10 
                 2                  1                 40                  1                  2                  6                 83 
            2,8,11              2,8,9           2,8,9,10           2,8,9,11                2,9             2,9,10             2,9,11 
                 4                 32                  3                  5                306                 46                 60 
              3,10               3,11             3,4,10         3,4,5,7,10           3,4,6,11           3,4,7,10           3,4,7,11 
              1406                 64                 18                  1                  1                  3                  1 
          3,4,8,10            3,4,8,9              3,4,9           3,4,9,10           3,4,9,11             3,5,10         3,5,6,9,10 
                 2                  1                  2                  2                  1                 10                  1 
          3,5,7,10            3,5,7,9              3,5,9             3,6,10             3,7,10              3,7,9           3,7,9,10 
                 1                  1                  1                 11                 22                  1                  2 
            3,8,10             3,8,11              3,8,9           3,8,9,10           3,8,9,11                3,9             3,9,10 
                53                  6                  5                  4                  1                127                 36 
            3,9,11               4,10               4,11             4,5,10             4,5,11           4,5,7,10           4,5,8,10 
                11               2701                 97                 30                  1                  7                  3 
             4,5,9             4,6,10             4,6,11           4,6,7,10           4,6,8,10              4,6,9             4,7,10 
                 2                 25                  2                  1                  2                  1                135 
            4,7,11           4,7,8,10            4,7,8,9              4,7,9           4,7,9,10           4,7,9,11             4,8,10 
                 2                  2                  1                  6                  9                  2                100 
            4,8,11              4,8,9           4,8,9,10                4,9             4,9,10             4,9,11               5,10 
                 4                  7                  2                179                 56                 28               2617 
              5,11             5,6,10              5,6,9             5,7,10           5,7,8,10             5,8,10             5,8,11 
                45                  2                  1                 46                  4                 48                  2 
             5,8,9           5,8,9,10           5,8,9,11                5,9             5,9,10             5,9,11               6,10 
                 5                  4                  1                 90                 34                 20               1233 
              6,11             6,7,10              6,7,9           6,7,9,10             6,8,10             6,8,11              6,8,9 
                18                 10                  1                  1                 40                  1                  1 
          6,8,9,11                6,9             6,9,10             6,9,11               7,10               7,11             7,8,10 
                 1                 33                 14                 14               4584                 42                 34 
          7,8,9,11                7,9             7,9,10             7,9,11               8,10               8,11                8,9 
                 1                137                108                 34               1715                101                 98 
            8,9,10             8,9,11                  9               9,10               9,11               <NA> 
                47                 38                401                338                269               9825 
Class: character 
# Create variables for each citizenship option (9 to 11)
df_merged <- df_merged |>
  dplyr::mutate(
    citizenship_cat = base::factor(
      dplyr::case_when(

        # When option 10 = "Citizen of [country]" was selected
        # and option 9 = "Born outside [country]" was not selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
        ~ "Citizen",

        # When option 11 = "Resident of [country] (non-citizen)" was selected
        # and option 9 = "Born outside [country]" was not selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
        ~ "Non-citizen (Permanent Resident)",

        # When option 9 = "Born outside [country]" was selected
        # and option 10 = "Citizen of [country]" was selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10"))
        ~ "Born outside country (Citizen)",

        # When option 9 = "Born outside [country]" was selected
        # and option 11 = "Resident of [country] (non-citizen)" was selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))
        ~ "Born outside country (Non-citizen, Permanent Resident)",

        # When only option 9 = "Born outside [country]" was selected.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")))
        ~ "Born outside country (Non-citizen, Non-permanent Resident)",

        TRUE ~ NA_character_
      ),

    levels = c(
      "Citizen",
      "Non-citizen (Permanent Resident)",
      "Born outside country (Citizen)",
      "Born outside country (Non-citizen, Permanent Resident)",
      "Born outside country (Non-citizen, Non-permanent Resident)")
    )
  ) |>
  dplyr::relocate(citizenship_cat, .after = ethnicity_citizenship_orig)

# Sanity check: View the distribution of citizenship categories
df_merged |>
  dplyr::mutate(
    # Extract only the citizenship options selected
    citizenship_extract = stringr::str_extract_all(
      ethnicity_citizenship_orig, "(?<=^|,)(9|10|11)(?=,|$)") |>
      purrr::map_chr(\(i) {
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  ) |> 
  dplyr::group_by(citizenship_extract, citizenship_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 6 × 3
# Groups:   citizenship_extract [6]
  citizenship_extract citizenship_cat                                                n
  <chr>               <fct>                                                      <int>
1 10                  Citizen                                                    54110
2 11                  Non-citizen (Permanent Resident)                            1549
3 9                   Born outside country (Non-citizen, Non-permanent Resident)  2169
4 9,10                Born outside country (Citizen)                              1155
5 9,11                Born outside country (Non-citizen, Permanent Resident)       600
6 <NA>                <NA>                                                        9825
Ethnicity
# Upload the ethnicity categories translated that were used for each country
ethnicity_cat <- 
  readr::read_csv("111_ethnicity_labels_translated.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 533
Columns: 3
$ UserLanguage  <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AR-ARE", "AR-ARE", "AR-AR…
$ option_number <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1…
$ label         <chr> "Armenians", "Ezidis", "Russians", "Assyrians", "Ukrainians", "Arab/Middle E…
# Extract the ethnicity options
df_merged <- df_merged |>
  dplyr::mutate(
    # Extract the ethnicity options
    # Don't extract option 8 = "Specify: [open text field]"
    # because that will be added later
    ethnicity_agg = stringr::str_extract_all(
      ethnicity_citizenship_orig,
      "(?<=^|,)(1|2|3|4|5|6|7)(?=,|$)"
    ) |>
      purrr::map_chr(\(i) {
        # Participants that did not complete this item should have NA.
        # Participants that completed a survey version
        # without ethnicity options should have NA.
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  )

# Replace ethnicity options with the translated labels
# Transformation will be conducted in a temporary data frame for safety
df_temp <- df_merged |>
  # Remove missing values for this transformation
  dplyr::filter(!is.na(ethnicity_agg)) |>
  # Separate values into rows
  # (if participant wrote "1,2", create two rows: one with "1" and another with "2")
  tidyr::separate_rows(ethnicity_agg, sep = ",") |>
  # Create variable that is going to match with ethnicity_cat
  dplyr::mutate(option_number = as.numeric(stringr::str_trim(ethnicity_agg))) |>
  # Join ethnicity_cat to get the translated labels
  dplyr::left_join(ethnicity_cat, by = c("UserLanguage", "option_number")) |>
  # Bring back to former format of having multiple options in a single row
  # but now with the translated labels instead of numbers
  dplyr::group_by(ResponseId) |>
  dplyr::summarise(
    ethnicity_translated = base::paste(label[!is.na(label)], collapse = ",")
  )

# Join back to main data frame
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(df_temp, by = "ResponseId")

nrow(df_merged)
[1] 69408
# Cleanup
rm(df_temp)

# Sanity check: Are the number of missing values in the new variable the same
# as in the original variable plus those that only selected citizenship options
# or only the please specify option (8)?
sum(is.na(df_merged$ethnicity_translated)) ==
  (sum(is.na(df_merged$ethnicity_citizenship_orig)) + sum(
    !is.na(df_merged$ethnicity_citizenship_orig) &
      stringr::str_detect(
        df_merged$ethnicity_citizenship_orig,
        "^(?:\\s*(?:8|9|10|11)\\s*)(?:,\\s*(?:8|9|10|11)\\s*)*$"
      )
  ))
[1] TRUE
# Add the cleaned responses from the "Specify: [open text field]" option (8)
ethnicity_recoded <- 
  readr::read_csv("111_ethnicity_open_answers_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 3,274
Columns: 2
$ ResponseId        <chr> "R_8FrYunIVSiVeX5B", "R_8rYZOG6u8qXwprj", "R_8p9yE9TFIjGUonc", "R_2Lzosf…
$ ethnicity_specify <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot dete…
df_merged <- df_merged |>
  dplyr::left_join(ethnicity_recoded, by = "ResponseId") |>
  dplyr::relocate(ethnicity_agg:ethnicity_specify, .after = ethnicity_citizenship_orig)

# Sanity check
dplyr::glimpse(df_merged |>
  dplyr::group_by(ethnicity_citizenship_orig, UserLanguage) |>
  dplyr::distinct(ethnicity_citizenship_orig, UserLanguage,
           ethnicity_agg, ethnicity_translated, ethnicity_specify,
           .keep_all = TRUE) |>
  dplyr::ungroup() |>
  dplyr::select(UserLanguage, ethnicity_citizenship_orig,
                ethnicity_agg, ethnicity_translated, ethnicity_specify), 
  width = 100)
Rows: 2,567
Columns: 5
$ UserLanguage               <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "FIL-PHL", "P…
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "3,10", "1,10", "1,10", "1,9", "2,3,5…
$ ethnicity_agg              <chr> "3,6", "1", "5", "3", "1", "1", "1", "2,3,5", "2", "2", "1,2", …
$ ethnicity_translated       <chr> "Diola / Malinké,Haalpulaaren", "Wolof / Lébou", "White", "Blac…
$ ethnicity_specify          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mo…
# View values
df_merged |>
  dplyr::filter(!is.na(ethnicity_citizenship_orig)) |>
  dplyr::mutate(
    ethnicity_extract = stringr::str_extract_all(
      ethnicity_citizenship_orig,
      "(?<=^|,)(1|2|3|4|5|6|7|8)(?=,|$)"
    ) |>
      purrr::map_chr(\(i) {
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  ) |>
  dplyr::group_by(country, ethnicity_extract, ethnicity_agg, 
                  ethnicity_translated, ethnicity_specify) |>
  dplyr::summarise(n = dplyr::n(), .groups = "drop") |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "country", width = 800)
# Cleanup
rm(ethnicity_cat, ethnicity_recoded)

Honeypot for Bots

An item was added to the survey, and via JavaScript code, the item was hidden from human participants. Bots that do not compute JavaScript code would process this item and provide an answer, which would allow us to identify them.

# NA indicates a valid response.
# All responses were valid, no bots were found through this item.
base::table(df_merged$bot_check, useNA = "always")

 <NA> 
69408 

Optional Ending

This item was optional and can be NA. Also, this item was not shown to sponsored participants.

# The item is already numeric and within the scale.
table_label(df_merged$followup)
$followup
You have now finished the main survey! Would you be willing to answer a few more similar questions?
    1     2  <NA> 
34936 13566 20906 
Class: numeric 

Childhood Socioeconomic Status

# Sanity check: View the counts of each option
table_label(df_merged$childhood_SES)
$childhood_SES
As a child, how would you describe the financial situation in your household compared to a typical home where you grew up?
    1     2     3     4     5  <NA> 
 4564 10353 15104  8711  1219 29457 
Class: numeric 
# Create categorical variable with labels
df_merged <- df_merged |>
  dplyr::mutate(
    childhood_SES_cat = base::factor(
      dplyr::case_when(
        childhood_SES == 1 ~ "Poor",
        childhood_SES == 2 ~ "Below average but not poor",
        childhood_SES == 3 ~ "Around average",
        childhood_SES == 4 ~ "Above average but not wealthy",
        childhood_SES == 5 ~ "Wealthy",
        TRUE ~ NA_character_
    ),
    levels = c(
      "Poor",
      "Below average but not poor",
      "Around average",
      "Above average but not wealthy",
      "Wealthy"
    ),
    ordered = TRUE)) |>
  dplyr::relocate(childhood_SES_cat, .after = childhood_SES)

# Sanity check: View the distribution of the new variable
base::table(df_merged$childhood_SES_cat, useNA = "always")

                         Poor    Below average but not poor                Around average Above average but not wealthy                       Wealthy 
                         4564                         10353                         15104                          8711                          1219 
                         <NA> 
                        29457 

Financial Outlook and Confidence

# Nothing to do here, the item is already numeric and
# within the minimum and maximum values.

table_label(df_merged$fin_outlook)
$fin_outlook
What is your expectation for how things will be for you financially one year from now?
    1     2     3     4     5  <NA> 
 1663  3952 13487 14794  5958 29554 
Class: numeric 
table_label(df_merged$fin_outlook_conf)
$fin_outlook_conf
On a scale from 1 (completely uncertain) to 10 (completely certain), how confident are you in your answer to the last question?
    1     2     3     4     5     6     7     8     9    10  <NA> 
  995   527  1014  1732  4371  4260  6809  8042  4590  7514 29554 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    fin_outlook_cat = base::factor(
      dplyr::case_when(
        fin_outlook == 1 ~ "Things will be much worse",
        fin_outlook == 2 ~ "Things will be somewhat worse",
        fin_outlook == 3 ~ "Things will be about the same",
        fin_outlook == 4 ~ "Things will be somewhat better",
        fin_outlook == 5 ~ "Things will be much better",
        TRUE ~ NA_character_
      ),
      levels = c(
        "Things will be much worse",
        "Things will be somewhat worse",
        "Things will be about the same",
        "Things will be somewhat better",
        "Things will be much better"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(fin_outlook_cat, .after = fin_outlook)

Attention and Care

# Sanity check: View the counts of each option
table_label(df_merged$attention_care)
$attention_care
There are people that care about and pay attention to what goes on in my life.
    1     2     3     4     5     6     7  <NA> 
  836   838  1949  4431 13507  9044  9214 29589 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    attention_care_cat = base::factor(
      dplyr::case_when(
        attention_care == 1 ~ "Completely disagree",
        attention_care == 2 ~ "Strongly disagree",
        attention_care == 3 ~ "Disagree",
        attention_care == 4 ~ "Neutral",
        attention_care == 5 ~ "Agree",
        attention_care == 6 ~ "Strongly agree",
        attention_care == 7 ~ "Completely agree",
        TRUE ~ NA_character_
    ),
    levels = c(
      "Completely disagree",
      "Strongly disagree",
      "Disagree",
      "Neutral",
      "Agree",
      "Strongly agree",
      "Completely agree"
    ),
    ordered = TRUE)) |>
  dplyr::relocate(attention_care_cat, .after = attention_care)

# Sanity check: View the distribution of the new variable
df_merged |>
  dplyr::count(attention_care, attention_care_cat)
# A tibble: 8 × 3
  attention_care attention_care_cat      n
           <dbl> <ord>               <int>
1              1 Completely disagree   836
2              2 Strongly disagree     838
3              3 Disagree             1949
4              4 Neutral              4431
5              5 Agree               13507
6              6 Strongly agree       9044
7              7 Completely agree     9214
8             NA <NA>                29589

Workplace Arragement

# Sanity check: View the counts of each option
table_label(df_merged$work_arrangement)
$work_arrangement
Which most accurately describes your current work (or study) arrangement?
    1     2     3     4     5  <NA> 
17717  6169  3558  3226  2890 35848 
Class: numeric 
# Create categorical variable with labels
df_merged <- df_merged %>%
  dplyr::mutate(
    work_arrangement_cat = base::factor(
      dplyr::case_when(

        work_arrangement == 1
        ~ "I work entirely in-person (i.e., in an office, on-site)",

        work_arrangement == 2
        ~ "I mostly work in-person, with occasional remote days",

        work_arrangement == 3
        ~ "I work about evenly in-person/remote",

        work_arrangement == 4
        ~ "I mostly work remotely, with occasional in-person days",

        work_arrangement == 5
        ~ "I work entirely remotely",

        TRUE ~ NA_character_
      ),
      levels = c(
        "I work entirely in-person (i.e., in an office, on-site)",
        "I mostly work in-person, with occasional remote days",
        "I work about evenly in-person/remote",
        "I mostly work remotely, with occasional in-person days",
        "I work entirely remotely"
      ),
      ordered = TRUE
    ),

    work_arrangement_cat_nostudents = base::factor(
      dplyr::if_else(
        employment_primary == "Student non-working (Full or part-time)",
        NA_character_,
        as.character(work_arrangement_cat)
      ),
      levels = levels(work_arrangement_cat),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(work_arrangement_cat, 
                  work_arrangement_cat_nostudents,
                  .after = work_arrangement)

# Sanity check: View the distribution of the new variable
df_merged |>
  dplyr::group_by(work_arrangement, work_arrangement_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 6 × 3
# Groups:   work_arrangement [6]
  work_arrangement work_arrangement_cat                                        n
             <dbl> <ord>                                                   <int>
1                1 I work entirely in-person (i.e., in an office, on-site) 17717
2                2 I mostly work in-person, with occasional remote days     6169
3                3 I work about evenly in-person/remote                     3558
4                4 I mostly work remotely, with occasional in-person days   3226
5                5 I work entirely remotely                                 2890
6               NA <NA>                                                    35848
# Sanity check: View the distribution of the new variable excluding students
base::table(df_merged$employment_primary,
            df_merged$work_arrangement_cat_nostudents, useNA = "always")
                                                          
                                                           I work entirely in-person (i.e., in an office, on-site)
  Not in paid employment (by choice/health)                                                                      0
  Not in paid employment (looking for work)                                                                    295
  Student non-working (Full or part-time)                                                                        0
  Employed/working full-time (25+ hours per week)                                                            13053
  Employed/working part-time (less than 25 hours per week)                                                    1898
  Retired                                                                                                        0
  Military service                                                                                             234
  <NA>                                                                                                           0
                                                          
                                                           I mostly work in-person, with occasional remote days I work about evenly in-person/remote
  Not in paid employment (by choice/health)                                                                   0                                    0
  Not in paid employment (looking for work)                                                                 118                                   79
  Student non-working (Full or part-time)                                                                     0                                    0
  Employed/working full-time (25+ hours per week)                                                          4363                                 2251
  Employed/working part-time (less than 25 hours per week)                                                  773                                  581
  Retired                                                                                                     0                                    0
  Military service                                                                                           51                                   48
  <NA>                                                                                                        0                                    0
                                                          
                                                           I mostly work remotely, with occasional in-person days I work entirely remotely  <NA>
  Not in paid employment (by choice/health)                                                                     0                        0  3567
  Not in paid employment (looking for work)                                                                    71                       97  3511
  Student non-working (Full or part-time)                                                                       0                        0  8478
  Employed/working full-time (25+ hours per week)                                                            2051                     1571 10252
  Employed/working part-time (less than 25 hours per week)                                                    586                      632  2061
  Retired                                                                                                       0                        0  2592
  Military service                                                                                             37                       28   559
  <NA>                                                                                                          0                        0  9571

Identification of Sponsored Participants

# Sanity check: View the counts of each option
table_label(df_merged$br)
$br
id
    5  <NA> 
 6445 62963 
Class: numeric 
table_label(df_merged$bs)
$bs
pay
    1  <NA> 
    1 69407 
Class: numeric 
table_label(df_merged$irl)
    0     1  <NA> 
68208  1200     0 
Class: numeric 
# Create a new variable to identify sponsored participants
df_merged <- df_merged |>
  dplyr::mutate(
    sponsored = dplyr::if_else(
      !is.na(br) | !is.na(bs) | irl == 1, 1, 0
    )
  )

# Sanity check
base::table(df_merged$sponsored, useNA = "always")

    0     1  <NA> 
61762  7646     0 

A0.2. Applying exclusion criteria

Direct exclusion criteria

# Identify exclusion criteria and assign status
df_merged <- df_merged |>

  # Create explicit flags for each rule
  dplyr::mutate(
    incomplete = is.na(debts_orig) & irl == 0,

    # E1. Not resident based on manual checking of location validation
    # important to note that the USA version was the default
    # when the survey link was broken or shared without specifying a country
    # in the URL metadata parameters.
    not_resident = loc_resident == 0,

    # E2. Implausible combination of working (3, 4, or 5 on employment)
    # and reporting zero income.
    working_zero_income =
      (stringr::str_detect(employment_orig, "\\b(3|4|5)\\b")) &
      (income_orig == 0 | income_text_clean == 0),

    # E3. Implausible combination of being retired (6 on employment)
    # and having an age <= 25
    retired_young =
      (stringr::str_detect(employment_orig, "\\b6\\b")) &
      (!is.na(age) & age <= 25),

    # E4. Implausible combination of reporting
    # very high MPWB (well-being) and very high PHQ4 (distress)
    extremes_mpwb_phq4 =
      !is.na(gad_worry) &
      (mpwb_sum >= 65 & phq4_sum >= 24),

    # E5. Respondents reporting high MPWB (well-being) and high PHQ-4 (distress),
    # combined with unusually short adjusted completion time.
    high_mpwb_phq4_speed =
      !is.na(gad_worry) &
      !is.na(duration_adj) &
      (mpwb_sum >= 64 & phq4_sum >= 23 & duration_adj < 10),

    # E6. Too-fast based on raw duration,
    # except sponsored participants from Ireland (who don't have duration data)
    too_fast_raw = duration_sec < 150 & irl == 0,

    # E7. We observed a China-specific pattern of
    # unusually fast completion times and low response variance.
    china_too_fast_low_var =
      country == "China" &
      duration_adj < 10 &
      mpwb_var < 1
  ) |>

  # Assign status based on the ordered exclusion criteria
  # (first match is assigned and the rest ignored)
  dplyr::mutate(
    valid_status = base::factor(dplyr::case_when(
      incomplete ~ "incomplete",
      not_resident ~ "not residents",
      working_zero_income ~ "implausible working with no income",
      retired_young ~ "implausible retired young",
      extremes_mpwb_phq4 ~ "implausible extremes",
      high_mpwb_phq4_speed ~ "implausible high scores with speed",
      too_fast_raw ~ "too fast general",
      china_too_fast_low_var ~ "too fast low var",
      TRUE ~ "passed"
    ),
    levels = c(
      "incomplete",
      "not residents",
      "implausible working with no income",
      "implausible retired young",
      "implausible extremes",
      "implausible high scores with speed",
      "too fast general",
      "too fast low var",
      "passed")
    )
  )

# Sanity checks: Overall counts per status
base::table(df_merged$valid_status, useNA = "always")

                        incomplete                      not residents implausible working with no income          implausible retired young 
                             12181                                705                                271                                 39 
              implausible extremes implausible high scores with speed                   too fast general                   too fast low var 
                                48                                  7                               1595                                737 
                            passed                               <NA> 
                             53825                                  0 
# Sanity checks: Check counts for incomplete
df_merged |>
  dplyr::filter(incomplete) |>
  dplyr::group_by(valid_status, Finished, debts_orig, phq_interest) |>
  dplyr::summarise(max_progress = max(Progress), n_incomplete = dplyr::n())
# A tibble: 1 × 6
# Groups:   valid_status, Finished, debts_orig [1]
  valid_status Finished debts_orig phq_interest max_progress n_incomplete
  <fct>           <dbl> <chr>             <dbl>        <dbl>        <int>
1 incomplete          0 <NA>                 NA           77        12181
# Sanity checks: Check counts for not residents
df_merged |>
  dplyr::filter(not_resident & !incomplete) |>
  dplyr::group_by(valid_status, country, loc_country) |>
  dplyr::summarise(n_not_resident = dplyr::n()) |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity checks: Check counts for participants working with zero income
df_merged |>
  dplyr::filter(working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, employment_cat, income_orig, income_text_orig) |>
  dplyr::summarise(n_working_zero_income = dplyr::n()) |>
  dplyr::arrange(-n_working_zero_income)
# A tibble: 15 × 5
# Groups:   valid_status, employment_cat, income_orig [15]
   valid_status                       employment_cat                                                income_orig income_text_orig n_working_zero_income
   <fct>                              <chr>                                                               <dbl> <chr>                            <int>
 1 implausible working with no income Employed/working full-time (25+ hours per week)                         0 <NA>                               119
 2 implausible working with no income Employed/working part-time (less than 25 hours per week)                0 <NA>                                45
 3 implausible working with no income Employed/working full-time (25+ hours per week)                        10 0                                   20
 4 implausible working with no income Full-time student; Employed/working part-time (less than 25 …           0 <NA>                                20
 5 implausible working with no income Full-time student; Employed/working full-time (25+ hours per…           0 <NA>                                14
 6 implausible working with no income Part-time student; Employed/working part-time (less than 25 …           0 <NA>                                13
 7 implausible working with no income Part-time student; Employed/working full-time (25+ hours per…           0 <NA>                                12
 8 implausible working with no income Military service                                                        0 <NA>                                10
 9 implausible working with no income Full-time student; Military service                                     0 <NA>                                 6
10 implausible working with no income Employed/working part-time (less than 25 hours per week)               10 0                                    3
11 implausible working with no income Part-time student; Military service                                     0 <NA>                                 3
12 implausible working with no income Employed/working full-time (25+ hours per week); Military se…           0 <NA>                                 2
13 implausible working with no income Military service                                                       10 0                                    2
14 implausible working with no income Full-time student; Employed/working full-time (25+ hours per…          10 0                                    1
15 implausible working with no income Part-time student; Employed/working part-time (less than 25 …          10 0                                    1
# Sanity checks: Check counts for retired young participants
df_merged |>
  dplyr::filter(retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, employment_orig, age_group) |>
  dplyr::summarise(n_retired_young = dplyr::n()) |>
  dplyr::arrange(-n_retired_young)
# A tibble: 4 × 4
# Groups:   valid_status, employment_orig [4]
  valid_status              employment_orig age_group n_retired_young
  <fct>                     <chr>           <fct>               <int>
1 implausible retired young 6               18-25                  27
2 implausible retired young 1,6             18-25                   6
3 implausible retired young 2,6             18-25                   2
4 implausible retired young 3,6             18-25                   2
# Sanity checks: Check counts for extremes in MPWB and PHQ4
df_merged |>
  dplyr::filter(extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, mpwb_sum, phq4_sum) |>
  dplyr::summarise(n_extremes_mpwb_phq4 = dplyr::n()) |>
  dplyr::arrange(-n_extremes_mpwb_phq4)
# A tibble: 6 × 4
# Groups:   valid_status, mpwb_sum [3]
  valid_status         mpwb_sum phq4_sum n_extremes_mpwb_phq4
  <fct>                   <dbl>    <dbl>                <int>
1 implausible extremes       70       28                   12
2 implausible extremes       69       28                    2
3 implausible extremes       67       25                    1
4 implausible extremes       69       24                    1
5 implausible extremes       70       25                    1
6 implausible extremes       70       26                    1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
  dplyr::filter(high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, mpwb_sum, phq4_sum, duration_adj) |>
  dplyr::summarise(n_high_mpwb_phq4_speed = dplyr::n()) |>
  dplyr::arrange(-n_high_mpwb_phq4_speed)
# A tibble: 3 × 5
# Groups:   valid_status, mpwb_sum, phq4_sum [3]
  valid_status                       mpwb_sum phq4_sum duration_adj n_high_mpwb_phq4_speed
  <fct>                                 <dbl>    <dbl>        <dbl>                  <int>
1 implausible high scores with speed       64       23         7.5                       1
2 implausible high scores with speed       64       28         5.32                      1
3 implausible high scores with speed       66       23         6.04                      1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
  dplyr::filter(too_fast_raw & !high_mpwb_phq4_speed &
                  !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status) |>
  dplyr::summarise(
    min(duration_sec), max(duration_sec), n_too_fast_raw = dplyr::n())
# A tibble: 1 × 4
  valid_status     `min(duration_sec)` `max(duration_sec)` n_too_fast_raw
  <fct>                          <dbl>               <dbl>          <int>
1 too fast general                  48                 149            718
# Sanity checks: Check counts for China-specific exclusion
df_merged |>
  dplyr::filter(china_too_fast_low_var & !too_fast_raw &
                  !high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, country) |>
  dplyr::summarise(
    min(mpwb_var), max(mpwb_var),
    min(duration_adj), max(duration_adj), n_china_too_fast_low_var = dplyr::n())
# A tibble: 1 × 7
# Groups:   valid_status [1]
  valid_status     country `min(mpwb_var)` `max(mpwb_var)` `min(duration_adj)` `max(duration_adj)` n_china_too_fast_low_var
  <fct>            <chr>             <dbl>           <dbl>               <dbl>               <dbl>                    <int>
1 too fast low var China                 0           0.989                   5                9.95                      439
# Clean data to only include "passed" participants
df_clean <- df_merged |>
  dplyr::filter(valid_status == "passed")

Assessments to the Financial variables

Collaborators reviewed the financial variables and created flags indicating whether the responses were valid or not. Basic demographic information about the participants was given only upon request to assist with the review. The income, assets, and debts values that fell within the first income bracket and the last bracket were considered valid by default.

In Zimbabwe, all values were sent for review because there was a concern that participants reported values in Zimbabwean dollar instead of USD as collaborators used in the translation. Also, 14 participants from USA with a value equal to the first income bracket should have been accepted automatically but were sent for review by mistake.

Collaborators were also asked to provide a minimum cut-off for each variable. When the minimum cut-off was higher than the first income bracket, their sheet was updated with the values between the first income bracket and the minimum cut-off. Values of 0 in either financial variable were automatically accepted as is and were not given to collaborators for revision. The values that contained NA, “,” or “.” were also requested for review in order to validate our cleaning script.

The countries where sociodemographic information were provided were: Albania, Bangladesh, Finland, Georgia, Japan, Latvia, Lebanon, Oman, Peru, Portugal, Qatar, Russia, Singapore, Switzerland, Timor-Leste, Ukraine, USA, and Zimbabwe.

This assessment was not conducted for the sponsored participants from Ireland, as they did not provide open field answers regarding income, and were not asked to report assets and debts.

# A manual revision of the values was conducted before the sheet was given to
# collaborators.

df_clean <- df_clean |>
  dplyr::mutate(
    fin_valid_aut_income =
      dplyr::case_when(

      # For participants that selected a decile
      # instead of providing an open text answer, consider them accepted
      income_orig < 10 ~ 1,

      # Values of 0 are automatically accepted as is.
      income_text_clean == 0 ~ 1,

      # If value contains "," or "." or other non-digit, consider them not accepted,
      # so collaborators can review them.
      !(stringr::str_detect(income_text_orig, "^[0-9]+$")) ~ 0,

      # If we detected a weird number, consider them not accepted.
      income_wrd ~ 0,

      # If value is above 0 but below the income first bracket,
      # consider them not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean > 0 & income_text_clean < income_highpoint_1 ~ 0,

      # For all other participants, execute automatic assessment:
      # The values that were within the income first bracket
      # and the value of the last income bracket were considered not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean >= income_highpoint_1 &
      income_text_clean <= income_lowpoint_9 ~ 1,

      # For values above the last income bracket, consider them not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean > income_lowpoint_9 ~ 0,

      TRUE ~ NA_real_
    ),

    fin_valid_aut_assets =
      dplyr::case_when(

      # Sponsored participants from Ireland are assigned NA
      # because no open text answers were collected from them.
      irl == 1 ~ NA_real_,
      assets_clean == 0 ~ 1,

      !(stringr::str_detect(assets_orig, "^[0-9]+$")) ~ 0,

      assets_wrd ~ 0,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean > 0 & assets_clean < income_highpoint_1) ~ 0,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean != 0 &
      assets_clean >= income_highpoint_1 &
      assets_clean <= income_lowpoint_9) ~ 1,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean != 0 &
      assets_clean > income_lowpoint_9) ~ 0,

      TRUE ~ NA_real_
    ),

    fin_valid_aut_debts =
      dplyr::case_when(
      irl == 1 ~ NA_real_,
      debts_clean == 0 ~ 1,

      !(stringr::str_detect(debts_orig, "^[0-9]+$")) ~ 0,

      debts_wrd ~ 0,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean > 0 & debts_clean < income_highpoint_1) ~ 0,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean != 0 &
      debts_clean >= income_highpoint_1 &
      debts_clean <= income_lowpoint_9) ~ 1,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean != 0 &
      debts_clean > income_lowpoint_9) ~ 0,

      TRUE ~ NA_real_
    )
  )

# Examine if the minimum cut-off provided is higher than the first income bracket.
df_clean <- df_clean |>
  dplyr::mutate(

    income_above_cutoff = income_cutoff_min > income_highpoint_1,
    assets_above_cutoff = assets_cutoff_min > income_highpoint_1,
    debts_above_cutoff = debts_cutoff_min > income_highpoint_1,

    fin_valid_aut_income_update =
      dplyr::case_when(
        income_above_cutoff == FALSE ~ fin_valid_aut_income,

        income_above_cutoff == TRUE &
        income_text_clean >= income_highpoint_1 &
        income_text_clean < income_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_income
    ),

    fin_valid_aut_assets_update =
      dplyr::case_when(
        assets_above_cutoff == FALSE ~ fin_valid_aut_assets,

        assets_above_cutoff == TRUE &
        assets_clean >= income_highpoint_1 &
        assets_clean < assets_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_assets
    ),

    fin_valid_aut_debts_update =
      dplyr::case_when(
        debts_above_cutoff == FALSE ~ fin_valid_aut_debts,

        debts_above_cutoff == TRUE &
        debts_clean >= income_highpoint_1 &
        debts_clean < debts_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_debts
    )
  )

# Sanity check: View the counts of automatic financial validity
base::table(df_clean$fin_valid_aut_income, useNA = "always")

    0     1  <NA> 
 3185 50531   109 
base::table(df_clean$fin_valid_aut_assets, useNA = "always")

    0     1  <NA> 
27728 24897  1200 
base::table(df_clean$fin_valid_aut_debts, useNA = "always")

    0     1  <NA> 
13503 39122  1200 

After we transmitted the values that were not automatically classified to collaborators in each country for review, we received back their assessments. We have extracted automatically the sheet with their assessments, and combined them into a single file.

# Extract sections from Excel files in folder "777_countries_documentation"
files <- list.files(
  path = "777_countries_documentation",
  pattern = "\\.xls[x]?$",
  full.names = TRUE) |>
  purrr::discard(
    # Exclude files named 777_Zambia and 777_Global
    ~stringr::str_detect(basename(.x),"^777_(Zambia|Global)"))

process_sheet <- function(path, sheet_name, start_row, tab_label) {
  sheet_all <- readxl::read_excel(path, sheet = sheet_name, col_names = FALSE)
  section <- sheet_all |> dplyr::slice(start_row:nrow(sheet_all)) |> dplyr::select(1:9)
  # drop header row
  section <- section |> dplyr::slice(-1)
  names(section) <- c(
    "ResponseId",
    "UserLanguage",
    "orig",
    "clean",
    "classification",
    "value",
    "cutoff_max",
    "cutoff_min",
    "notes"
  )
    section <- section |>
      dplyr::mutate(
      file = tools::file_path_sans_ext(basename(path)),
      tab = tab_label
    )
  section
}

assessment_fin <- purrr::map_dfr(files, function(path) {
  d1 <- process_sheet(path, "HOUSEHOLD INCOME", 22, "income")
  d2 <- process_sheet(path, "ASSETS", 9, "assets")
  d3 <- process_sheet(path, "DEBTS", 9, "debts")
  dplyr::bind_rows(d1, d2, d3)
}) |>
  dplyr::rename(
    change = value
  ) |>
  dplyr::mutate(
    clean = base::as.numeric(clean),
    cutoff_max = base::as.numeric(cutoff_max),
    cutoff_min = base::as.numeric(cutoff_min)
  ) |>
  tidyr::pivot_wider(
    id_cols = c("ResponseId", "UserLanguage"),
    names_from = "tab",
    values_from = c(
      "change",
      "classification",
      "cutoff_min",
      "cutoff_max",
      "orig",
      "clean"
    ),
    names_sep = "_"
  )
# Sanity check: View the assessment_fin data frame
dplyr::glimpse(assessment_fin, width = 100)
Rows: 33,245
Columns: 20
$ ResponseId            <chr> "R_2S9d1LQe5gzhMGp", "R_2duaXZQf76tNTnX", "R_8YEzJo4GF1VSJiU", "R_8D…
$ UserLanguage          <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "S…
$ change_income         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_assets         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_debts          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ classification_income <chr> "Not possible/not believable", "Not possible/not believable", "Not p…
$ classification_assets <chr> "OK", "OK", NA, "OK", "OK", "OK", "OK", "OK", "OK", NA, NA, NA, NA, …
$ classification_debts  <chr> "OK", NA, "Cannot determine", NA, "Cannot determine", NA, NA, NA, NA…
$ cutoff_min_income     <dbl> 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000…
$ cutoff_min_assets     <dbl> 1000, 1000, NA, 1000, 1000, 1000, 1000, 1000, 1000, NA, NA, NA, NA, …
$ cutoff_min_debts      <dbl> 1000, NA, 1000, NA, 1000, NA, NA, NA, NA, NA, NA, NA, NA, 1000, NA, …
$ cutoff_max_income     <dbl> 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 25000…
$ cutoff_max_assets     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cutoff_max_debts      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ orig_income           <chr> "100", "410", "500", "600", "700", "800", "900", "1200", "2000", "30…
$ orig_assets           <chr> "10000", "100000", NA, "10000", "5000", "5000", "3000", "500.000", "…
$ orig_debts            <chr> "5000", NA, "500", NA, "250", NA, NA, NA, NA, NA, NA, NA, NA, "30000…
$ clean_income          <dbl> 100, 410, 500, 600, 700, 800, 900, 1200, 2000, 3000, 3840, 5000, 700…
$ clean_assets          <dbl> 10000, 100000, NA, 10000, 5000, 5000, 3000, 500000, 2000, NA, NA, NA…
$ clean_debts           <dbl> 5000, NA, 500, NA, 250, NA, NA, NA, NA, NA, NA, NA, NA, 30000, NA, N…
# Sanity check: Compare the cut-off min values
fin_cut_income <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_income) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_income)) |> dplyr::rename(income_cutoff_min = cutoff_min_income)

df_cut_income <- df_clean |> dplyr::group_by(UserLanguage, income_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(income_cutoff_min) & UserLanguage %in% fin_cut_income$UserLanguage)

dplyr::setequal(fin_cut_income, df_cut_income)
[1] TRUE
fin_cut_assets <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_assets) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_assets)) |> dplyr::rename(assets_cutoff_min = cutoff_min_assets)

df_cut_assets <- df_clean |> dplyr::group_by(UserLanguage, assets_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(assets_cutoff_min) & UserLanguage %in% fin_cut_assets$UserLanguage)

dplyr::setequal(fin_cut_assets, df_cut_assets)
[1] TRUE
fin_cut_debts <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_debts) |> summarise() |> dplyr::filter(!is.na(cutoff_min_debts)) |> dplyr::rename(debts_cutoff_min = cutoff_min_debts)

df_cut_debts <- df_clean |> dplyr::group_by(UserLanguage, debts_cutoff_min) |> dplyr::summarise() |> filter(!is.na(debts_cutoff_min) & UserLanguage %in% fin_cut_debts$UserLanguage)

dplyr::setequal(fin_cut_debts, df_cut_debts)
[1] TRUE
# Sanity check:
# Are there any UserLanguage in assessment_fin that are not in df_merged?
base::setdiff(
  unique(assessment_fin$UserLanguage),
  unique(df_merged$UserLanguage)
)
character(0)
# Sanity check:
# All values between clean_income in assessment_fin
# and income_text_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_income) |> filter(!is.na(clean_income)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, income_text_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_income == income_text_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE   3183
# All values between clean_assets in assessment_fin
# and assets_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_assets) |> filter(!is.na(clean_assets)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, assets_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_assets == assets_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE  27351
# All values between clean_debts in assessment_fin
# and debts_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_debts) |> filter(!is.na(clean_debts)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, debts_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_debts == debts_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE  12860
# Sanity check: Any duplicated ResponseId in assessment_fin?
assessment_fin |>
  dplyr::count(ResponseId) |>
  dplyr::filter(n > 1) |>
  base::nrow()
[1] 0
# Join assessments to main data frame
nrow(df_clean)
[1] 53825
df_clean <- df_clean |>
  dplyr::left_join(
    assessment_fin |> dplyr::select(
      ResponseId,
      classification_income,
      change_income,
      classification_assets,
      change_assets,
      classification_debts,
      change_debts
    ),
    by = c("ResponseId")
  ) |>

  # Apply the changes recommended by collaborators
  dplyr::mutate(
    income_text_reviewed = dplyr::case_when(
      !is.na(classification_income) &
      stringr::str_detect(classification_income, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_income),
      TRUE ~ income_text_clean
    ),
    assets_reviewed = dplyr::case_when(
      !is.na(classification_assets) &
      stringr::str_detect(classification_assets, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_assets),
      TRUE ~ assets_clean
    ),
    debts_reviewed = dplyr::case_when(
      !is.na(classification_debts) &
      stringr::str_detect(classification_debts, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_debts),
      TRUE ~ debts_clean
    )
  )

nrow(df_clean)
[1] 53825
# Sanity checks: View the counts of cells that were automatically approved 
# and were still reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean, 
                          classification_income, fin_valid_aut_income,
                          fin_valid_aut_income_update) |> 
  dplyr::filter(fin_valid_aut_income_update == 1 & !is.na(classification_income) 
                & income_text_clean > 0) |> 
  dplyr::group_by(country, classification_income) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean, 
                          classification_assets, fin_valid_aut_assets,
                          fin_valid_aut_assets_update) |>
  dplyr::filter(fin_valid_aut_assets_update==1 & !is.na(classification_assets) 
                & assets_clean > 0) |> 
  dplyr::group_by(country, classification_assets) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 3 × 3
# Groups:   country [2]
  country  classification_assets     n
  <chr>    <chr>                 <int>
1 USA      Cannot determine          2
2 USA      OK                       94
3 Zimbabwe OK                       82
df_clean |> dplyr::select(ResponseId, country, debts_clean, 
                          classification_debts, fin_valid_aut_debts,
                          fin_valid_aut_debts_update) |>
  dplyr::filter(fin_valid_aut_debts_update== 1 & !is.na(classification_debts) &
                  debts_clean > 0) |> 
  dplyr::group_by(country, classification_debts) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 3 × 3
# Groups:   country [2]
  country  classification_debts                   n
  <chr>    <chr>                              <int>
1 USA      OK                                    14
2 Zimbabwe Change to: [add value on column F]    20
3 Zimbabwe OK                                    58
# Sanity checks: View the counts of cells that were automatically disapproved 
# and were not reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean, 
                          classification_income, fin_valid_aut_income,
                          fin_valid_aut_income_update) |>
  dplyr::filter(fin_valid_aut_income_update == 0 & is.na(classification_income) 
                & income_text_clean > 0) |> 
  dplyr::group_by(country, classification_income) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean, 
                          classification_assets, fin_valid_aut_assets,
                          fin_valid_aut_assets_update) |>
  dplyr::filter(fin_valid_aut_assets_update == 0 & is.na(classification_assets) 
                & assets_clean > 0) |> 
  dplyr::group_by(country, classification_assets) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, debts_clean, classification_debts,
                          fin_valid_aut_debts, fin_valid_aut_debts_update) |>
  dplyr::filter(fin_valid_aut_debts_update == 0 & is.na(classification_debts) 
                & debts_clean > 0) |> 
  dplyr::group_by(country, classification_debts) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
# Create variable where we fit the open field answers into the brackets
find_decile <- function(lang, income_val) {
  # If value is missing, return NA for this row
  if (is.na(income_val)) {
    return(NA_real_)
  }

  # Subset brackets for language
  brackets <- income_gaps[income_gaps$UserLanguage == lang, ]

  # If no brackets available for this language, return NA
  if (nrow(brackets) == 0) {
    return(NA_real_)
  }

  for (j in seq_len(nrow(brackets))) {
    low  <- brackets$income_lowpoint_adj[j]
    high <- brackets$income_highpoint_adj[j]

    # Skip rows with missing low
    if (is.na(low)) {
      next
    }

    # Open-ended bracket: [low, ∞)
    if (is.na(high)) {
      if (income_val >= low) {
        return(base::as.numeric(brackets$income_orig[j]))
      } else {
        next
      }
    }

    # Interval [low, high] inclusive
    if (income_val >= low && income_val <= high) {
      return(base::as.numeric(brackets$income_orig[j]))
    }
  }

  # If higher than all defined brackets, assign 9 by your current rule
  9
}

df_clean <- df_clean |>
  dplyr::mutate(
    income_merg = dplyr::case_when(
      
      is.na(income_orig) ~ NA_real_, 
      
      !is.na(income_orig) & income_orig != 10 ~ income_orig,
      
      income_orig == 10 & is.na(income_text_reviewed) ~ NA_real_,
      
      income_orig == 10 &
      !is.na(income_text_reviewed) & 
      income_text_reviewed == 0 ~ 0,
      
      TRUE ~ purrr::map2_dbl(
        UserLanguage,
        income_text_reviewed,
        find_decile
      )
    ),
    income_merg_cat = base::factor(
      dplyr::case_when(
        income_merg == 0 ~ "No income",
        income_merg == 1 ~ "Second decile",
        income_merg == 2 ~ "Third decile",
        income_merg == 3 ~ "Fourth decile",
        income_merg == 4 ~ "Fifth decile",
        income_merg == 5 ~ "Sixth decile",
        income_merg == 6 ~ "Seventh decile",
        income_merg == 7 ~ "Eighth decile",
        income_merg == 8 ~ "Ninth decile",
        income_merg == 9 ~ "Tenth decile",
        TRUE ~ NA_character_
      ),
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile"
      ),
      ordered = TRUE
    ),
    income_merg_group =  base::factor(
      dplyr::case_when(
        income_merg_cat == "No income"
        ~ "No income",
        income_merg_cat %in% c("Second decile", "Third decile", "Fourth decile")
        ~ "Low",
        income_merg_cat %in% c("Fifth decile", "Sixth decile")
        ~ "Mid",
        income_merg_cat %in% c("Seventh decile", "Eighth decile", "Ninth decile")
        ~ "Upper",
        income_merg_cat == "Tenth decile"
        ~ "Wealthiest",
        TRUE ~ NA_character_
      ),
      levels = c("No income", "Low", "Mid", "Upper", "Wealthiest"),
      ordered = TRUE
    )
  )

df_clean <- df_clean |>
  dplyr::left_join(
    income_gaps |>
      dplyr::select(
        UserLanguage,
        income_orig,
        income_lowpoint_adj,
        income_highpoint_adj
      ),
    by = c("UserLanguage", "income_merg" = "income_orig")
  ) |>
  dplyr::mutate(
    income_merg_translated = dplyr::case_when(
      is.na(income_merg) ~ NA_character_,

      income_merg == 0 ~ "0",

      # Closed interval [low, high]
      !is.na(income_lowpoint_adj) &
      !is.na(income_highpoint_adj)
      ~ paste0(
          income_lowpoint_adj,
          " - ",
          income_highpoint_adj
        ),

      # Open upper bound [low, ∞)
      !is.na(income_lowpoint_adj) &
      is.na(income_highpoint_adj)
      ~ paste0(
          income_lowpoint_adj,
          "+"
        ),

      TRUE ~ NA_character_
    )
  )

# Sanity checks: View counts of merged income variable
df_clean |>
  dplyr::filter(
    income_orig == 10,
    !is.na(income_text_reviewed)
  ) |>
  dplyr::group_by(
    UserLanguage,
    income_merg,
    income_merg_translated
  ) |>
  dplyr::summarise(
    min_income_text_reviewed = min(income_text_reviewed, na.rm = TRUE),
    max_income_text_reviewed = max(income_text_reviewed, na.rm = TRUE)
  ) |>
  print_reactable(sorted_col = "UserLanguage", width = 800)
df_clean |> 
  dplyr::group_by(income_orig, income_merg) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::print(n = Inf)
# A tibble: 21 × 3
# Groups:   income_orig [12]
   income_orig income_merg     n
         <dbl>       <dbl> <int>
 1           0           0  1866
 2           1           1  4775
 3           2           2  5754
 4           3           3  5961
 5           4           4  5778
 6           5           5  5195
 7           6           6  4646
 8           7           7  4583
 9           8           8  3585
10           9           9  5347
11          10           0    91
12          10           1  1951
13          10           2   834
14          10           3   717
15          10           4   583
16          10           5   443
17          10           6   356
18          10           7   297
19          10           8   208
20          10           9   746
21          NA          NA   109
# Cleanup
rm(fin_cut_income, fin_cut_debts, fin_cut_assets, find_decile,
   assessment_fin, process_sheet, files, df_cut_assets, df_cut_debts, 
   df_cut_income)

Red flag exclusion

Each flag corresponds to a specific pattern that may indicate low-quality data. This process was only applied to participants who passed the direct exclusion criteria.

# Identified participants with IP addresses known to be associated with botnets.
botnet_ids <-
  readr::read_csv("111_response_ids_botnets.csv", show_col_types = FALSE) |>
  dplyr::pull(ResponseId) |>
  base::trimws(); length(botnet_ids)
[1] 262
# Identified participants with IP addresses massively repeated
# across multiple responses.
massive_rep_ids <-
  readr::read_csv("111_ip_repeated.csv", show_col_types = FALSE) |>
  dplyr::pull(ResponseId) |>
  base::trimws(); length(massive_rep_ids)
[1] 10300
# Start the flagging process.
df_flagged <- df_clean |>
  dplyr::mutate(

    # F1. Household >=4 and zero income
    flag_hh4_zero_income =
      dplyr::if_else(
          irl == 0 &
          household_size >= 4 &
          income_merg == 0,
        1,
        0,
        missing = NA_real_
      ),

    # F2. Any financial items not valid
    flag_fin_invalid =
      dplyr::if_else(
          irl == 0 &
        (
          (!is.na(classification_assets) &
             !(classification_assets %in% c("OK", "Change to: [add value on column F]"))) |
          (!is.na(classification_debts) &
             !(classification_debts %in% c("OK", "Change to: [add value on column F]"))) |
          (!is.na(classification_income) &
             !(classification_income %in% c("OK", "Change to: [add value on column F]")))
        ),
        1,
        0,
        missing = NA_real_
      ),

    # F3. Low variance in MPWB, life satisfaction = 10,
    # and no income or very low education
    flag_ls10_noincome_var =
      dplyr::if_else(
          irl == 0 &
          mpwb_var < 1 &
          life_satisfaction == 10 &
          (education_recoded == 1 |
           income_merg == 0),
        1,
        0,
        missing = NA_real_
      ),

    # F4. Assets and debts are the same value (excluding both zero and NA)
    flag_assets_debts_same =
      dplyr::if_else(
          irl == 0 &
          !is.na(assets_clean) &
          !is.na(debts_clean) &
          assets_clean == debts_clean &
          !(assets_clean == 0 & debts_clean == 0),
        1,
        0,
        missing = NA_real_
      ),

    # F5. Full-time student and lowest education level
    # (Peru participants that selected inclusive education are exempt because
    # they have NA in education_recoded)
    flag_student_lowedu =
      dplyr::if_else(
          !is.na(education_recoded) &
          education_recoded == 1 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b1\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F6. Zero variance in MPWB items
    flag_mpwb_zerovar =
      dplyr::if_else(
        !is.na(mpwb_var) & mpwb_var == 0,
        0.5,
        0,
        missing = NA_real_
      ),

    # F7. Nonsensical sex or ethnicity
    flag_nonsensical_sex_ethn =
      dplyr::if_else(
          irl == 0 &
          (
            (!is.na(sex_text_recoded) & sex_text_recoded == "Cannot determine") |
            (!is.na(ethnicity_specify) & ethnicity_specify == "Cannot determine")
          ),
        1,
        0,
        missing = NA_real_
      ),

    # F8. High MPWB and high PHQ-4
    flag_high_mpwb_phq4 =
      dplyr::if_else(
          irl == 0 &
          !is.na(gad_worry) &
          mpwb_sum >= 60 &
          phq4_sum >= 20 &
          duration_adj < 10,
        1,
        0,
        missing = NA_real_
      ),

    # F9. LS vs mean MPWB mismatch
    flag_ls_vs_mpwb =
      dplyr::case_when(
        base::abs(life_satisfaction - mpwb_mean) > 5 ~ 2,
        base::abs(life_satisfaction - mpwb_mean) > 4 ~ 1,
        TRUE ~ 0
      ),

    # F10. Age >75 and working or studying
    flag_age75_workstudy =
      dplyr::if_else(
          age >= 75 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b(1|2|3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F11. Advanced education and <22 years
    flag_young_advance =
      dplyr::if_else(
          !is.na(education_recoded) &
          education_recoded == 5 &
          age < 22,
        1,
        0,
        missing = NA_real_
      ),

    # F12. Short duration and low MPWB variance
    flag_duration_var =
      dplyr::if_else(
          irl == 0 &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),

    # F13. Independent, <20, and richest income
    flag_young_rich_alone =
      dplyr::if_else(
          household_size == 1 &
          age < 20 &
          income_merg >= 8,
        1,
        0,
        missing = NA_real_
      ),

    # F14. Retired and working at the same time
    flag_retired_working =
      dplyr::if_else(
          irl == 0 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b6\\b") &
          stringr::str_detect(employment_orig, "\\b(3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F15. Strange numbers in financial variables
    # If collaborators already marked the value as not OK,
    # then there is no need to repeat this flag.
    flag_weird_nr =
      dplyr::case_when(
          irl == 0 &
          (
            (assets_wrd & classification_assets == "OK") |
            (debts_wrd & classification_debts == "OK") |
            (income_wrd & classification_income == "OK")
          )
          ~ 1,
        TRUE ~ 0
      ),

    # F16. Botnet ResponseIds
    flag_botnet =
      dplyr::if_else(
          ResponseId %in% botnet_ids,
        1,
        0,
        missing = NA_real_
      ),

    # F17. Massive repetition of IP + short duration + low variance
    flag_rep =
      dplyr::if_else(
          ResponseId %in% massive_rep_ids &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),

    # Total flags
    flag_total =
      base::rowSums(
        dplyr::across(dplyr::starts_with("flag_")),
        na.rm = TRUE
      ),

    # Exclusion flag
    exclusion_flags =
      dplyr::if_else(
        flag_total > 4,
        1,
        0
      ),

    # Update valid_status
    valid_status = base::as.character(valid_status),
    valid_status =
      base::factor(
        dplyr::case_when(
          exclusion_flags == 1 ~ "flagged",
          TRUE ~ valid_status
        ),
        levels = c(
          "flagged",
          "passed"
        )
      )
  )

# Sanity Check: View the counts of exclusion flags
table(df_flagged$valid_status, df_flagged$exclusion_flags, useNA = "always")
         
              0     1  <NA>
  flagged     0    26     0
  passed  53799     0     0
  <NA>        0     0     0
# Sanity Check: View the counts and percentages of each flag
df_flagged |>
  dplyr::select(dplyr::starts_with("flag_")) |>
  dplyr::summarise(
    dplyr::across(
      dplyr::everything(),
      ~ sum((!is.na(.) & . != 0))
    )
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "flag",
    values_to = "n_flagged"
  ) |>
  dplyr::mutate(
    percent_flagged = 100 * n_flagged / nrow(df_flagged)
  ) |>
  dplyr::arrange(dplyr::desc(percent_flagged))
# A tibble: 18 × 3
   flag                      n_flagged percent_flagged
   <chr>                         <int>           <dbl>
 1 flag_total                    16396         30.5   
 2 flag_duration_var              6935         12.9   
 3 flag_fin_invalid               4366          8.11  
 4 flag_ls_vs_mpwb                3146          5.84  
 5 flag_mpwb_zerovar              1804          3.35  
 6 flag_nonsensical_sex_ethn      1052          1.95  
 7 flag_assets_debts_same          703          1.31  
 8 flag_rep                        621          1.15  
 9 flag_hh4_zero_income            493          0.916 
10 flag_retired_working            337          0.626 
11 flag_ls10_noincome_var          215          0.399 
12 flag_botnet                     202          0.375 
13 flag_student_lowedu             162          0.301 
14 flag_weird_nr                   124          0.230 
15 flag_age75_workstudy             70          0.130 
16 flag_young_advance               50          0.0929
17 flag_high_mpwb_phq4              29          0.0539
18 flag_young_rich_alone             8          0.0149
# Sanity check: View combinations of classifications where flag_hh4_zero_income is raised
df_flagged |> 
  dplyr::filter(flag_hh4_zero_income == 1) |> 
  dplyr::group_by(household_size, income_merg, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 15 × 4
# Groups:   household_size, income_merg [15]
   household_size income_merg   irl     n
            <dbl>       <dbl> <dbl> <int>
 1              4           0     0   196
 2              5           0     0   120
 3              6           0     0    68
 4              7           0     0    41
 5              8           0     0    20
 6              9           0     0    10
 7             10           0     0    15
 8             11           0     0     1
 9             12           0     0     4
10             13           0     0     3
11             14           0     0     3
12             15           0     0     2
13             17           0     0     1
14             18           0     0     1
15             20           0     0     8
# Sanity check: View combinations of classifications where flag_fin_invalid is raised
df_flagged |> 
  dplyr::filter(flag_fin_invalid == 1) |> 
  dplyr::group_by(classification_income, classification_assets, classification_debts, irl) |> 
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "classification_income", width = 800)
# Sanity check: View combinations of classifications where flag_ls10_noincome_var is raised
df_flagged |> 
  dplyr::filter(flag_ls10_noincome_var == 1) |> 
  dplyr::group_by(mpwb_var, life_satisfaction, education_recoded, income_merg, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "mpwb_var", width = 800)
# Sanity check: View combinations of classifications where flag_assets_debts_same is raised
df_flagged |> 
  dplyr::filter(flag_assets_debts_same == 1) |> 
  dplyr::group_by(assets_clean, debts_clean, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_clean", width = 500)
# Sanity check: View combinations of classifications where flag_student_lowedu is raised
df_flagged |> 
  dplyr::filter(flag_student_lowedu == 1) |> 
  dplyr::group_by(education_recoded, employment_orig) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 8 × 3
# Groups:   education_recoded [1]
  education_recoded employment_orig     n
              <dbl> <chr>           <int>
1                 1 1                 130
2                 1 1,3                 7
3                 1 1,4                 7
4                 1 1,5                 1
5                 1 1,6                 1
6                 1 1,7                 3
7                 1 1,8                 6
8                 1 1,9                 7
# Sanity check: View combinations of classifications where flag_mpwb_zerovar is raised
df_flagged |> 
  dplyr::filter(flag_mpwb_zerovar == 0.5) |> 
  dplyr::group_by(mpwb_var) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  mpwb_var     n
     <dbl> <int>
1        0  1804
# Sanity check: View combinations of classifications where flag_nonsensical_sex_ethn is raised
df_flagged |> 
  dplyr::filter(flag_nonsensical_sex_ethn == 1) |> 
  dplyr::group_by(sex_text_recoded, ethnicity_specify, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 10 × 4
# Groups:   sex_text_recoded, ethnicity_specify [10]
   sex_text_recoded ethnicity_specify                              irl     n
   <chr>            <chr>                                        <dbl> <int>
 1 Cannot determine Cannot determine                                 0    14
 2 Cannot determine Cypriot                                          0     1
 3 Cannot determine Other                                            0     5
 4 Cannot determine Roma                                             0     1
 5 Cannot determine The 4 Rs: (Nkole, Kiga, Batooro and Banyoro)     0     1
 6 Cannot determine White                                            0     1
 7 Cannot determine <NA>                                             0    81
 8 Male             Cannot determine                                 0     2
 9 Non-binary       Cannot determine                                 0     6
10 <NA>             Cannot determine                                 0   940
# Sanity check: View combinations of classifications where flag_high_mpwb_phq4 is raised
df_flagged |> 
  dplyr::filter(flag_high_mpwb_phq4 == 1) |> 
  dplyr::group_by(mpwb_sum, phq4_sum, gad_worry, duration_adj, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  base::print(n = Inf)
# A tibble: 29 × 6
# Groups:   mpwb_sum, phq4_sum, gad_worry, duration_adj [29]
   mpwb_sum phq4_sum gad_worry duration_adj   irl     n
      <dbl>    <dbl>     <dbl>        <dbl> <dbl> <int>
 1       60       20         5         7.66     0     1
 2       60       22         5         6.41     0     1
 3       60       25         6         9.03     0     1
 4       60       28         7         7.66     0     1
 5       61       22         4         9.59     0     1
 6       61       28         7         7.10     0     1
 7       62       20         4         7.8      0     1
 8       62       22         7         8.83     0     1
 9       62       25         5         9.66     0     1
10       62       25         7         6.72     0     1
11       62       26         6         7.79     0     1
12       62       28         7         9.39     0     1
13       62       28         7         9.93     0     1
14       63       20         6         9.17     0     1
15       63       21         4         5.55     0     1
16       63       24         5         6.76     0     1
17       63       24         6         6.59     0     1
18       63       24         6         8.2      0     1
19       63       24         6         9.10     0     1
20       63       25         7         5.90     0     1
21       63       28         7         7.03     0     1
22       64       22         7         8.23     0     1
23       65       20         5         9        0     1
24       65       21         6         5.37     0     1
25       66       21         6         7.83     0     1
26       66       22         6         9.38     0     1
27       70       20         5         6.90     0     1
28       70       20         7         7.93     0     1
29       70       22         4         6.6      0     1
# Sanity check: View combinations of classifications where flag_ls_vs_mpwb is raised
df_flagged |> 
  dplyr::filter(flag_ls_vs_mpwb >= 1) |>
  dplyr::mutate(diff_ls_mpwb = base::abs(life_satisfaction - mpwb_mean)) |>
  dplyr::group_by(life_satisfaction, mpwb_mean, diff_ls_mpwb, flag_ls_vs_mpwb, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "mpwb_mean", width = 500)
# Sanity check: View combinations of classifications where flag_age75_workstudy is raised
df_flagged |> 
  dplyr::filter(flag_age75_workstudy == 1) |>
  dplyr::group_by(employment_cat, age) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_cat", width = 800)
# Sanity check: View combinations of classifications where flag_young_advance is raised
df_flagged |> 
  dplyr::filter(flag_young_advance == 1) |>
  dplyr::group_by(education_recoded_cat, age) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 4 × 3
# Groups:   education_recoded_cat [1]
  education_recoded_cat   age     n
  <ord>                 <dbl> <int>
1 Advanced                 18     5
2 Advanced                 19    13
3 Advanced                 20    12
4 Advanced                 21    20
# Sanity check: View combinations of classifications where flag_duration_var is raised
df_flagged |> 
  dplyr::filter(flag_duration_var == 1) |>
  dplyr::mutate(min_duration_adj = min(duration_adj), 
                max_duration_adj = max(duration_adj),
                min_mpwb_var = min(mpwb_var),
                max_mpwb_var = max(mpwb_var)) |>
  dplyr::group_by(min_duration_adj, max_duration_adj, 
                  min_mpwb_var, max_mpwb_var, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 6
# Groups:   min_duration_adj, max_duration_adj, min_mpwb_var, max_mpwb_var [1]
  min_duration_adj max_duration_adj min_mpwb_var max_mpwb_var   irl     n
             <dbl>            <dbl>        <dbl>        <dbl> <dbl> <int>
1                5             9.97            0        0.989     0  6935
# Sanity check: View combinations of classifications where flag_young_rich_alone is raised
df_flagged |> 
  dplyr::filter(flag_young_rich_alone == 1) |>
  dplyr::group_by(household_size, age, income_merg) |> 
  summarise(n = dplyr::n())
# A tibble: 4 × 4
# Groups:   household_size, age [2]
  household_size   age income_merg     n
           <dbl> <dbl>       <dbl> <int>
1              1    18           8     2
2              1    18           9     1
3              1    19           8     3
4              1    19           9     2
# Sanity check: View combinations of classifications where flag_retired_working is raised
df_flagged |> 
  dplyr::filter(flag_retired_working == 1) |>
  dplyr::group_by(employment_orig, age, irl) |> 
  summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_orig", width = 500)
# Sanity check: View combinations of classifications where flag_weird_nr is raised
df_flagged |> 
  dplyr::filter(flag_weird_nr == 1) |>
  dplyr::group_by(assets_wrd, classification_assets, 
                  debts_wrd, classification_debts, 
                  income_wrd, classification_income, irl) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::print(n = Inf)
# A tibble: 25 × 8
# Groups:   assets_wrd, classification_assets, debts_wrd, classification_debts, income_wrd, classification_income [25]
   assets_wrd classification_assets       debts_wrd classification_debts        income_wrd classification_income                irl     n
   <lgl>      <chr>                       <lgl>     <chr>                       <lgl>      <chr>                              <dbl> <int>
 1 FALSE      OK                          FALSE     <NA>                        TRUE       OK                                     0    16
 2 FALSE      OK                          TRUE      OK                          FALSE      OK                                     0     2
 3 FALSE      OK                          TRUE      OK                          FALSE      Very unlikely to be true               0     2
 4 FALSE      OK                          TRUE      OK                          FALSE      <NA>                                   0    19
 5 FALSE      OK                          TRUE      <NA>                        TRUE       OK                                     0     1
 6 FALSE      Very unlikely to be true    FALSE     <NA>                        TRUE       OK                                     0     1
 7 FALSE      Very unlikely to be true    TRUE      OK                          FALSE      <NA>                                   0     2
 8 FALSE      <NA>                        FALSE     OK                          TRUE       OK                                     0     3
 9 FALSE      <NA>                        FALSE     <NA>                        TRUE       OK                                     0     6
10 FALSE      <NA>                        TRUE      OK                          FALSE      OK                                     0     1
11 FALSE      <NA>                        TRUE      OK                          FALSE      <NA>                                   0    13
12 TRUE       Not possible/not believable TRUE      Not possible/not believable TRUE       OK                                     0     1
13 TRUE       OK                          FALSE     Not possible/not believable FALSE      <NA>                                   0     3
14 TRUE       OK                          FALSE     OK                          FALSE      Change to: [add value on column F]     0     1
15 TRUE       OK                          FALSE     OK                          FALSE      OK                                     0     1
16 TRUE       OK                          FALSE     OK                          FALSE      <NA>                                   0     6
17 TRUE       OK                          FALSE     Very unlikely to be true    FALSE      <NA>                                   0     7
18 TRUE       OK                          FALSE     <NA>                        FALSE      Change to: [add value on column F]     0     1
19 TRUE       OK                          FALSE     <NA>                        FALSE      OK                                     0     1
20 TRUE       OK                          FALSE     <NA>                        FALSE      Very unlikely to be true               0     1
21 TRUE       OK                          FALSE     <NA>                        FALSE      <NA>                                   0    31
22 TRUE       OK                          TRUE      Cannot determine            FALSE      <NA>                                   0     1
23 TRUE       OK                          TRUE      OK                          FALSE      <NA>                                   0     1
24 TRUE       OK                          TRUE      Very unlikely to be true    FALSE      Very unlikely to be true               0     1
25 TRUE       Very unlikely to be true    TRUE      OK                          FALSE      <NA>                                   0     2
# Apply reviewed financial variables, unless all three flags F4, F15, and F2 are raised
df_flagged <- df_flagged |>
  dplyr::mutate(
    assets_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_assets) &
        !classification_assets %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ assets_reviewed
    ),
    debts_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_debts) &
        !classification_debts %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ debts_reviewed
    ),
    income_text_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ income_text_reviewed
    ),
    income_merg = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ income_merg
    ),
    income_merg_translated = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_translated
    ),
    income_merg_group = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_group
    ),
    income_merg_cat = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_cat
    ),
    
    income_merg_cat = base::factor(
      income_merg_cat,
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile"
      ),
      ordered = TRUE
    ),
    
    income_merg_group = base::factor(
      income_merg_group,
      levels = c(
        "No income",
        "Low",
        "Mid",
        "Upper",
        "Wealthiest"
      ),
      ordered = TRUE
    )
  )

# Sanity check:
df_flagged |> 
  dplyr::filter(is.na(income_text_reviewed) & !is.na(income_merg) & income_orig == 10) |>
  base::nrow()
[1] 0
df_flagged |> 
  dplyr::filter(
      is.na(income_text_reviewed) &
      income_orig ==10 & 
      classification_income %in% c("OK", "Change to: [remove value]")) |> 
  base::nrow()
[1] 0
df_flagged |> 
  dplyr::filter(
      is.na(income_text_reviewed) &
      income_orig == 10 & 
      is.na(classification_income)) |> 
  base::nrow()
[1] 0
# Create final data frame with only "passed" participants after flagging
df_final <- df_flagged |> 
  dplyr::filter(valid_status == "passed") |>
  dplyr::select(-valid_status, -dplyr::starts_with("flag_"))

# Cleanup
rm(botnet_ids, massive_rep_ids, income_gaps, income_info)

Exclusion Summary

# Combine direct exclusions and flags
df_exclusion <- df_merged |>
  dplyr::left_join(
    df_flagged |>
      dplyr::select(ResponseId, exclusion_flags),
    by = "ResponseId"
  ) |>
  dplyr::mutate(
    exclusion_criteria = base::factor(
      dplyr::case_when(
      valid_status %in% c("incomplete","not residents") ~ valid_status,
      valid_status %in% c(
        "implausible working with no income",
        "implausible retired young",
        "implausible extremes",
        "implausible high scores with speed"
      ) ~ "implausible",
      valid_status %in% c("too fast general","too fast low var") ~ "too fast",
      valid_status == "passed" & 
        !is.na(exclusion_flags) & exclusion_flags == 1 ~ "flagged",
      
      valid_status == "passed" & 
        (is.na(exclusion_flags) | exclusion_flags == 0) ~ "valid",
      
      TRUE ~ NA_character_
    ),
      levels = c(
        "valid",
        "incomplete",
        "too fast",
        "not residents",
        "implausible",
        "flagged"
      )
    )
  )

# Country-level summary
summary_table <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = base::sum(exclusion_criteria == "valid", na.rm = TRUE),
    incomplete = base::sum(exclusion_criteria == "incomplete", na.rm = TRUE),
    too_fast = base::sum(exclusion_criteria == "too fast", na.rm = TRUE),
    not_residents = base::sum(exclusion_criteria == "not residents", na.rm = TRUE),
    implausible = base::sum(exclusion_criteria == "implausible", na.rm = TRUE),
    flagged = base::sum(exclusion_criteria == "flagged", na.rm = TRUE)
  ) |>
  dplyr::mutate(
    total_exclusions =
      initial_number_of_participants - valid_participants,
    total_pct_lost =
      (total_exclusions / initial_number_of_participants) * 100
  )

# Total row
total_row <- summary_table |>
  dplyr::summarise(
    country = "Total",
    initial_number_of_participants =
      base::sum(initial_number_of_participants),
    valid_participants = base::sum(valid_participants),
    incomplete = base::sum(incomplete),
    too_fast = base::sum(too_fast),
    not_residents = base::sum(not_residents),
    implausible = base::sum(implausible),
    flagged = base::sum(flagged),
    total_exclusions = base::sum(total_exclusions),
    total_pct_lost =
      (total_exclusions / initial_number_of_participants) * 100
  )

summary_table_pct <- dplyr::bind_rows(summary_table, total_row) |>
  dplyr::mutate(
    incomplete = paste0(
      incomplete,
      " (",
      format(round((incomplete / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    too_fast = paste0(
      too_fast,
      " (",
      format(round((too_fast / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    not_residents = paste0(
      not_residents,
      " (",
      format(round((not_residents / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    implausible = paste0(
      implausible,
      " (",
      format(round((implausible / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    flagged = paste0(
      flagged,
      " (",
      format(round((flagged / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    total_exclusions = paste0(
      total_exclusions,
      " (",
      format(round(total_pct_lost, 2), nsmall = 1),
      "%)"
    )
  ) |>
  dplyr::select(-total_pct_lost)

# gt
summary_table_pdf <- summary_table_pct |>
  dplyr::rename(
    Country = country,
    "Initial number of participants" = initial_number_of_participants,
    "Valid participants" = valid_participants,
    "Incomplete" = incomplete,
    "Too fast" = too_fast,
    "Not residents" = not_residents,
    "Implausible combinations" = implausible,
    "Flagged" = flagged,
    "Total exclusions" = total_exclusions
  )

gt_table <- summary_table_pdf |>
  gt::gt() |>
  gt::cols_width(
    Country ~ gt::px(65),
    `Initial number of participants` ~ gt::px(70),
    `Valid participants` ~ gt::px(70),
    Incomplete ~ gt::px(70),
    `Too fast` ~ gt::px(70),
    `Not residents` ~ gt::px(70),
    `Implausible combinations` ~ gt::px(70),
    `Flagged` ~ gt::px(70),
    `Total exclusions` ~ gt::px(105)
  ) |>
  gt::tab_options(
    table.font.size = 10,
    column_labels.font.size = 11,
    table.background.color = "white",
    table.align = "center",
    table.width = gt::px(650),
    table.border.top.color = "white",
    table.border.bottom.color = "white",
    table.border.left.color = "white",
    table.border.right.color = "white",
    table_body.hlines.color = "black",
    table_body.vlines.color = "white",
    column_labels.vlines.color = "white",
    column_labels.border.top.color = "white",
    column_labels.border.bottom.color = "black"
  ) |>
  gt::opt_table_lines() |>
  gt::tab_style(
    style = list(
      gt::cell_text(
        weight = "bold",
        align = "center"
      )
    ),
    locations = gt::cells_column_labels(gt::everything())
  ) |>
  gt::tab_style(
    style = gt::cell_text(
      align = "left"
    ),
    locations = gt::cells_body(columns = Country)
  ) |>
  gt::tab_style(
    style = gt::cell_text(
      align = "center"
    ),
    locations = gt::cells_body(
      columns = c(
        `Initial number of participants`,
        `Valid participants`,
        Incomplete,
        `Too fast`,
        `Not residents`,
        `Implausible combinations`,
        `Flagged`,
        `Total exclusions`
      )
    )
  ); gt_table
Country Initial number of participants Valid participants Incomplete Too fast Not residents Implausible combinations Flagged Total exclusions
Albania 2284 1758 487 (21.32%) 4 ( 0.18%) 34 (1.49%) 1 (0.04%) 0 (0.00%) 526 (23.03%)
Algeria 203 149 53 (26.11%) 1 ( 0.49%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 54 (26.60%)
Angola 329 240 68 (20.67%) 15 ( 4.56%) 1 (0.30%) 5 (1.52%) 0 (0.00%) 89 (27.05%)
Argentina 769 634 119 (15.47%) 3 ( 0.39%) 11 (1.43%) 2 (0.26%) 0 (0.00%) 135 (17.56%)
Armenia 334 246 83 (24.85%) 1 ( 0.30%) 1 (0.30%) 3 (0.90%) 0 (0.00%) 88 (26.35%)
Australia 605 500 67 (11.07%) 25 ( 4.13%) 7 (1.16%) 5 (0.83%) 1 (0.17%) 105 (17.36%)
Austria 685 570 106 (15.47%) 8 ( 1.17%) 1 (0.15%) 0 (0.00%) 0 (0.00%) 115 (16.79%)
Bahrain 211 161 48 (22.75%) 1 ( 0.47%) 1 (0.47%) 0 (0.00%) 0 (0.00%) 50 (23.70%)
Bangladesh 536 335 186 (34.70%) 0 ( 0.00%) 0 (0.00%) 14 (2.61%) 1 (0.19%) 201 (37.50%)
Belgium 331 272 42 (12.69%) 14 ( 4.23%) 2 (0.60%) 1 (0.30%) 0 (0.00%) 59 (17.82%)
Bolivia 341 279 59 (17.30%) 0 ( 0.00%) 2 (0.59%) 1 (0.29%) 0 (0.00%) 62 (18.18%)
Bosnia-Herzegovina 642 486 144 (22.43%) 11 ( 1.71%) 1 (0.16%) 0 (0.00%) 0 (0.00%) 156 (24.30%)
Brazil 2094 1809 241 (11.51%) 30 ( 1.43%) 1 (0.05%) 13 (0.62%) 0 (0.00%) 285 (13.61%)
Bulgaria 393 324 67 (17.05%) 1 ( 0.25%) 0 (0.00%) 1 (0.25%) 0 (0.00%) 69 (17.56%)
Canada 874 707 148 (16.93%) 16 ( 1.83%) 1 (0.11%) 2 (0.23%) 0 (0.00%) 167 (19.11%)
Chad 192 115 73 (38.02%) 0 ( 0.00%) 2 (1.04%) 2 (1.04%) 0 (0.00%) 77 (40.10%)
Chile 240 207 30 (12.50%) 3 ( 1.25%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 33 (13.75%)
China 2523 1018 215 ( 8.52%) 1277 (50.61%) 0 (0.00%) 13 (0.52%) 0 (0.00%) 1505 (59.65%)
Croatia 455 349 99 (21.76%) 7 ( 1.54%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 106 (23.30%)
Cyprus 218 173 41 (18.81%) 0 ( 0.00%) 0 (0.00%) 4 (1.83%) 0 (0.00%) 45 (20.64%)
Czech Republic 267 202 63 (23.60%) 1 ( 0.37%) 1 (0.37%) 0 (0.00%) 0 (0.00%) 65 (24.34%)
Denmark 338 283 47 (13.91%) 7 ( 2.07%) 0 (0.00%) 1 (0.30%) 0 (0.00%) 55 (16.27%)
Ecuador 1075 954 94 ( 8.74%) 7 ( 0.65%) 11 (1.02%) 9 (0.84%) 0 (0.00%) 121 (11.26%)
Egypt 869 630 221 (25.43%) 5 ( 0.58%) 9 (1.04%) 4 (0.46%) 0 (0.00%) 239 (27.50%)
Estonia 2402 1903 480 (19.98%) 4 ( 0.17%) 3 (0.12%) 12 (0.50%) 0 (0.00%) 499 (20.77%)
Ethiopia 552 403 141 (25.54%) 1 ( 0.18%) 4 (0.72%) 3 (0.54%) 0 (0.00%) 149 (26.99%)
Finland 275 241 24 ( 8.73%) 7 ( 2.55%) 1 (0.36%) 2 (0.73%) 0 (0.00%) 34 (12.36%)
France 1175 908 153 (13.02%) 90 ( 7.66%) 2 (0.17%) 21 (1.79%) 1 (0.09%) 267 (22.72%)
Georgia 504 371 126 (25.00%) 6 ( 1.19%) 1 (0.20%) 0 (0.00%) 0 (0.00%) 133 (26.39%)
Germany 1008 824 156 (15.48%) 21 ( 2.08%) 1 (0.10%) 5 (0.50%) 1 (0.10%) 184 (18.25%)
Greece 532 444 81 (15.23%) 3 ( 0.56%) 2 (0.38%) 2 (0.38%) 0 (0.00%) 88 (16.54%)
Hong Kong 237 176 45 (18.99%) 6 ( 2.53%) 10 (4.22%) 0 (0.00%) 0 (0.00%) 61 (25.74%)
Hungary 735 555 169 (22.99%) 5 ( 0.68%) 3 (0.41%) 3 (0.41%) 0 (0.00%) 180 (24.49%)
India 1627 1225 315 (19.36%) 26 ( 1.60%) 21 (1.29%) 34 (2.09%) 6 (0.37%) 402 (24.71%)
Indonesia 1501 1223 250 (16.66%) 13 ( 0.87%) 3 (0.20%) 8 (0.53%) 4 (0.27%) 278 (18.52%)
Iran 292 216 75 (25.68%) 0 ( 0.00%) 0 (0.00%) 1 (0.34%) 0 (0.00%) 76 (26.03%)
Ireland 1661 1526 110 ( 6.62%) 10 ( 0.60%) 12 (0.72%) 3 (0.18%) 0 (0.00%) 135 ( 8.13%)
Israel 437 353 75 (17.16%) 8 ( 1.83%) 0 (0.00%) 1 (0.23%) 0 (0.00%) 84 (19.22%)
Italy 566 489 66 (11.66%) 7 ( 1.24%) 1 (0.18%) 3 (0.53%) 0 (0.00%) 77 (13.60%)
Japan 549 431 76 (13.84%) 36 ( 6.56%) 0 (0.00%) 5 (0.91%) 1 (0.18%) 118 (21.49%)
Kazakhstan 787 676 91 (11.56%) 11 ( 1.40%) 8 (1.02%) 1 (0.13%) 0 (0.00%) 111 (14.10%)
Kosovo 1373 994 359 (26.15%) 4 ( 0.29%) 12 (0.87%) 4 (0.29%) 0 (0.00%) 379 (27.60%)
Kuwait 315 241 69 (21.90%) 2 ( 0.63%) 2 (0.63%) 1 (0.32%) 0 (0.00%) 74 (23.49%)
Kyrgyzstan 375 274 74 (19.73%) 22 ( 5.87%) 1 (0.27%) 4 (1.07%) 0 (0.00%) 101 (26.93%)
Latvia 1023 806 206 (20.14%) 6 ( 0.59%) 2 (0.20%) 3 (0.29%) 0 (0.00%) 217 (21.21%)
Lebanon 416 322 84 (20.19%) 0 ( 0.00%) 3 (0.72%) 7 (1.68%) 0 (0.00%) 94 (22.60%)
Madagascar 169 145 22 (13.02%) 0 ( 0.00%) 0 (0.00%) 2 (1.18%) 0 (0.00%) 24 (14.20%)
Malaysia 816 706 99 (12.13%) 2 ( 0.25%) 1 (0.12%) 7 (0.86%) 1 (0.12%) 110 (13.48%)
Mexico 1164 1062 84 ( 7.22%) 10 ( 0.86%) 1 (0.09%) 6 (0.52%) 1 (0.09%) 102 ( 8.76%)
Moldova 511 398 100 (19.57%) 3 ( 0.59%) 4 (0.78%) 5 (0.98%) 1 (0.20%) 113 (22.11%)
Mongolia 367 261 100 (27.25%) 0 ( 0.00%) 6 (1.63%) 0 (0.00%) 0 (0.00%) 106 (28.88%)
Montenegro 358 301 45 (12.57%) 4 ( 1.12%) 7 (1.96%) 1 (0.28%) 0 (0.00%) 57 (15.92%)
Morocco 302 231 61 (20.20%) 3 ( 0.99%) 1 (0.33%) 6 (1.99%) 0 (0.00%) 71 (23.51%)
Mozambique 154 122 31 (20.13%) 0 ( 0.00%) 0 (0.00%) 1 (0.65%) 0 (0.00%) 32 (20.78%)
Netherlands 448 353 79 (17.63%) 14 ( 3.12%) 1 (0.22%) 1 (0.22%) 0 (0.00%) 95 (21.21%)
Nigeria 721 636 75 (10.40%) 0 ( 0.00%) 8 (1.11%) 2 (0.28%) 0 (0.00%) 85 (11.79%)
North Macedonia 268 230 37 (13.81%) 0 ( 0.00%) 1 (0.37%) 0 (0.00%) 0 (0.00%) 38 (14.18%)
Norway 509 408 90 (17.68%) 11 ( 2.16%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 101 (19.84%)
Oman 520 413 100 (19.23%) 1 ( 0.19%) 3 (0.58%) 2 (0.38%) 1 (0.19%) 107 (20.58%)
Pakistan 507 401 94 (18.54%) 2 ( 0.39%) 1 (0.20%) 9 (1.78%) 0 (0.00%) 106 (20.91%)
Paraguay 205 162 42 (20.49%) 1 ( 0.49%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 43 (20.98%)
Peru 1031 868 87 ( 8.44%) 48 ( 4.66%) 4 (0.39%) 23 (2.23%) 1 (0.10%) 163 (15.81%)
Philippines 3556 2636 769 (21.63%) 17 ( 0.48%) 120 (3.37%) 12 (0.34%) 2 (0.06%) 920 (25.87%)
Poland 1288 1024 250 (19.41%) 8 ( 0.62%) 3 (0.23%) 3 (0.23%) 0 (0.00%) 264 (20.50%)
Portugal 579 451 118 (20.38%) 7 ( 1.21%) 0 (0.00%) 3 (0.52%) 0 (0.00%) 128 (22.11%)
Qatar 526 397 113 (21.48%) 5 ( 0.95%) 11 (2.09%) 0 (0.00%) 0 (0.00%) 129 (24.52%)
Republic of Korea 492 425 27 ( 5.49%) 34 ( 6.91%) 1 (0.20%) 5 (1.02%) 0 (0.00%) 67 (13.62%)
Romania 861 676 174 (20.21%) 10 ( 1.16%) 0 (0.00%) 1 (0.12%) 0 (0.00%) 185 (21.49%)
Russia 1322 1168 73 ( 5.52%) 51 ( 3.86%) 17 (1.29%) 12 (0.91%) 1 (0.08%) 154 (11.65%)
Saudi Arabia 296 260 27 ( 9.12%) 6 ( 2.03%) 1 (0.34%) 2 (0.68%) 0 (0.00%) 36 (12.16%)
Senegal 211 142 66 (31.28%) 0 ( 0.00%) 1 (0.47%) 2 (0.95%) 0 (0.00%) 69 (32.70%)
Serbia 420 324 89 (21.19%) 4 ( 0.95%) 3 (0.71%) 0 (0.00%) 0 (0.00%) 96 (22.86%)
Singapore 298 239 19 ( 6.38%) 39 (13.09%) 0 (0.00%) 1 (0.34%) 0 (0.00%) 59 (19.80%)
Slovakia 724 517 196 (27.07%) 3 ( 0.41%) 5 (0.69%) 3 (0.41%) 0 (0.00%) 207 (28.59%)
Slovenia 746 584 154 (20.64%) 5 ( 0.67%) 1 (0.13%) 2 (0.27%) 0 (0.00%) 162 (21.72%)
South Africa 279 233 45 (16.13%) 0 ( 0.00%) 0 (0.00%) 1 (0.36%) 0 (0.00%) 46 (16.49%)
Spain 729 614 104 (14.27%) 6 ( 0.82%) 3 (0.41%) 2 (0.27%) 0 (0.00%) 115 (15.78%)
Sweden 1149 824 266 (23.15%) 53 ( 4.61%) 0 (0.00%) 6 (0.52%) 0 (0.00%) 325 (28.29%)
Switzerland 823 668 139 (16.89%) 14 ( 1.70%) 0 (0.00%) 2 (0.24%) 0 (0.00%) 155 (18.83%)
Taiwan 201 146 36 (17.91%) 17 ( 8.46%) 0 (0.00%) 2 (1.00%) 0 (0.00%) 55 (27.36%)
Thailand 440 375 59 (13.41%) 4 ( 0.91%) 0 (0.00%) 2 (0.45%) 0 (0.00%) 65 (14.77%)
Timor-Leste 277 144 129 (46.57%) 0 ( 0.00%) 1 (0.36%) 3 (1.08%) 0 (0.00%) 133 (48.01%)
Türkiye 682 487 171 (25.07%) 11 ( 1.61%) 12 (1.76%) 1 (0.15%) 0 (0.00%) 195 (28.59%)
UAE 336 228 86 (25.60%) 6 ( 1.79%) 13 (3.87%) 3 (0.89%) 0 (0.00%) 108 (32.14%)
UK 852 671 147 (17.25%) 26 ( 3.05%) 5 (0.59%) 3 (0.35%) 0 (0.00%) 181 (21.24%)
USA 5708 4242 1002 (17.55%) 170 ( 2.98%) 280 (4.91%) 11 (0.19%) 3 (0.05%) 1466 (25.68%)
Uganda 332 242 85 (25.60%) 0 ( 0.00%) 1 (0.30%) 4 (1.20%) 0 (0.00%) 90 (27.11%)
Ukraine 749 654 88 (11.75%) 3 ( 0.40%) 3 (0.40%) 1 (0.13%) 0 (0.00%) 95 (12.68%)
Uruguay 815 566 246 (30.18%) 1 ( 0.12%) 2 (0.25%) 0 (0.00%) 0 (0.00%) 249 (30.55%)
Uzbekistan 662 556 102 (15.41%) 4 ( 0.60%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 106 (16.01%)
Yemen 580 397 170 (29.31%) 1 ( 0.17%) 5 (0.86%) 7 (1.21%) 0 (0.00%) 183 (31.55%)
Zimbabwe 275 210 59 (21.45%) 3 ( 1.09%) 1 (0.36%) 2 (0.73%) 0 (0.00%) 65 (23.64%)
Total 69408 53799 12181 (17.55%) 2332 ( 3.36%) 705 (1.02%) 365 (0.53%) 26 (0.04%) 15609 (22.49%)
gt::gtsave(gt_table, "222_exclusion_table.html")
gt::gtsave(gt_table, "222_exclusion_table.docx")
pagedown::chrome_print(
  "222_exclusion_table.html",
  output = "222_exclusion_table.pdf"
)

# Country-level inclusion and sample sizes
country_inclusion <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = sum(exclusion_criteria == "valid", na.rm = TRUE),
    inclusion_rate = format(round(100 * valid_participants /
      initial_number_of_participants, 2), nsmall = 2)
  )

# Country with minimum and maximum inclusion rates
country_inclusion |>
  dplyr::slice_min(inclusion_rate, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 China                             2523               1018 40.35         
country_inclusion |>
  dplyr::slice_max(inclusion_rate, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 Ireland                           1661               1526 91.87         
# Countries with smallest and largest valid sample sizes
country_inclusion |>
  dplyr::slice_min(valid_participants, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 Chad                               192                115 59.90         
country_inclusion |>
  dplyr::slice_max(valid_participants, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 USA                               5708               4242 74.32         
# Cleanup
rm(df_exclusion, gt_table, summary_table, summary_table_pct, summary_table_pdf, total_row)

Missing Data

# Focus on the original variables
orig_cols <- base::intersect(names(df_final), names(df_pub))

df_final_orig <- df_final |>
  dplyr::select(all_of(orig_cols))

visdat::vis_miss(df_final_orig, cluster = TRUE, warn_large_data = FALSE)

df_final_orig |>
  dplyr::summarise(
    dplyr::across(
      dplyr::everything(),
      \(x) 100 * mean(is.na(x), na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "col",
    values_to = "pct_missing"
  ) |>
  print_reactable(sorted_col = "pct_missing", width = 600)
# Cleanup
rm(orig_cols, df_final_orig)

A0.3. Harmonize financial variables

Midpoint of the brackets

The midpoints of the brackets were computed as (low_bracket + high_bracket)/2 except the last bracket needs to be computed differently because it is open-ended. We computed the median ratio between the midpoints of the last bracket and the low point of the last bracket across countries where the low point of the last bracket was lower than the maximum open text answer provided by participants. This median ratio was then used to compute the midpoint of the last bracket in countries where the low point of the last bracket was higher than the maximum open text answer provided by participants.

# Calculate midpoints for all brackets except the last one
income_brackets <- income_recoded |>
  dplyr::mutate(
    income_midpoint = dplyr::case_when(
      income_orig == 9 ~ NA_real_,
      TRUE ~ (income_lowpoint + income_highpoint) / 2
    )
  ) |> 
  dplyr::select(UserLanguage, income_orig, income_midpoint) |>
  dplyr::glimpse(width = 100)
Rows: 1,125
Columns: 3
$ UserLanguage    <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB…
$ income_orig     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7,…
$ income_midpoint <dbl> 60000.0, 160000.0, 250000.0, 355000.0, 515000.0, 725000.0, 1040000.0, 1625…
nrow(df_final)
[1] 53799
df_final <- df_final |>
  dplyr::left_join(income_brackets, by = c("UserLanguage", "income_orig"))

nrow(df_final)
[1] 53799
# Calculate midpoints for the last bracket
midpoints_last <- df_final |>
  # There are no income values from the open text field in Taiwan
  dplyr::filter(country != "Taiwan") |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    # Ireland have different lowpoints for the last bracket
    # in the main dataset and the sponsored dataset.
    max_income_lowpoint = base::max(income_lowpoint_9, na.rm = TRUE),
    max_income_text = base::max(income_text_reviewed, na.rm = TRUE),
    income_midpoint_last =
      base::mean(c(max_income_lowpoint, max_income_text), na.rm = TRUE),
    bracket_higher_than_text =
      !is.na(max_income_text) & max_income_lowpoint > max_income_text,
    ratio = income_midpoint_last/max_income_lowpoint
  ) |> 
  dplyr::glimpse(width = 100)
Rows: 91
Columns: 6
$ country                  <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "Australi…
$ max_income_lowpoint      <dbl> 2000000, 200000, 1450000001, 2400000, 1200001, 250000, 93163, 230…
$ max_income_text          <dbl> 2000000, 400000, 200000000, 40000000, 4000000, 290000, 200000, 55…
$ income_midpoint_last     <dbl> 2000000.0, 300000.0, 825000000.5, 21200000.0, 2600000.5, 270000.0…
$ bracket_higher_than_text <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ ratio                    <dbl> 1.0000000, 1.5000000, 0.5689655, 8.8333333, 2.1666653, 1.0800000,…
ratio_last_bracket <- midpoints_last |>
  dplyr::filter(!bracket_higher_than_text) |>
  dplyr::summarise(median_ratio = stats::median(ratio, na.rm = TRUE)) |>
  dplyr::pull(median_ratio); ratio_last_bracket
[1] 2
nrow(df_final)
[1] 53799
df_final <- df_final |>
  dplyr::left_join(midpoints_last |> dplyr::select(country, bracket_higher_than_text, max_income_lowpoint, income_midpoint_last), by = "country")

nrow(df_final)
[1] 53799
# Update income_midpoint for the last bracket
df_final <-
  df_final |>
  dplyr::mutate(
    income_midpoint = dplyr::case_when(
      income_orig == 9 & !bracket_higher_than_text ~ income_midpoint_last,
      income_orig == 9 &  bracket_higher_than_text ~ max_income_lowpoint * ratio_last_bracket,
      income_orig == 9 & country == "Taiwan" 
       ~ income_lowpoint_9 * ratio_last_bracket,
      TRUE ~ income_midpoint
    )
  )

# Sanity check: View last midpoints
df_final |>
  dplyr::filter(income_orig == 9) |>
  dplyr::group_by(country, income_orig, income_lowpoint_9, income_midpoint) |>
  dplyr::summarise() |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "income_midpoint", width = 800)
# Cleanup
rm(income_recoded, income_brackets, midpoints_last, ratio_last_bracket)

Convert monthly values to annual

We created a variable that combined midpoints from brackets with the open field answers and converted monthly values to annual values.

The survey versions for Bahrain and Pakistan requested for annual income in their native language, but monthly in the English version. If the distribution of responses is similar, we will retain the transformed monthly values.

table(df_final$income_period, useNA = "always")

 annual monthly    <NA> 
  28802   24997       0 
table(df_final$wages_per_year, df_final$income_period, useNA = "always")
      
       annual monthly  <NA>
  12        0   21935     0
  12.5      0     241     0
  13        0    2821     0
  <NA>  28802       0     0
df_final <- df_final |>
  dplyr::mutate(

    income_cont = dplyr::case_when(
      income_orig == 0 ~ 0,
      income_orig == 10 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),

    income_cont_nozero = dplyr::case_when(
      income_orig == 10 & income_text_reviewed > 0 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),

    income_annual = dplyr::case_when(
      income_period == "monthly" ~ income_cont_nozero * wages_per_year,
      TRUE ~ income_cont_nozero
    )
  )

df_final |> dplyr::group_by(country, income_orig, income_merg, income_text_reviewed,
                            income_merg_translated, income_cont, income_midpoint,
                            income_cont_nozero, income_annual, income_period) |>
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::arrange(country, income_orig) |>
  print_reactable(sorted_col = "country", width = 900)

Convert 2025 income values to 2024

The inflation2024_factor was calculated with Consumer Price Index (CPI): CPI 2025 / CPI 2024

If national reports only provided the percentage of inflation change, then the factor was calculated as 1 + (percentage inflation change / 100). Then, to convert 2025 income to 2024 values we divided 2025 income by this factor.

table(df_final$income_year, useNA = "always")

 2024  2025  <NA> 
50097  3702     0 
table(df_final$inflation2024_factor, useNA = "always")

 0.966  1.001 1.0021 1.0182 1.0399 1.0415  1.045  1.046  1.073  1.087  1.118   <NA> 
   115    293    241    500    371    122    230    324    274    556    676  50097 
df_final <- df_final |>
  dplyr::mutate(
    income_annual_24 = dplyr::case_when(
      income_year == 2025 ~ income_annual / inflation2024_factor,
      TRUE ~ income_annual
    )
  )

Income net and gross

Collaborators from Kuwait, Oman, and Saudi Arabia confirmed that the income values do not require transformation because there is no income tax in these countries.

The survey version for Belgium requested net income in the Dutch version, and gross income in the French version.

Since the calculation of social contribution and tax deduction is not the same for all countries, we will remove the countries that asked for net income.

We will not apply tax brackets for Zimbabwe, since the system changes throughout the year due to inflation.

df_final <- df_final |>
  dplyr::mutate(
    income_annual_24_gross = dplyr::case_when(
      income_type == "net" ~ NA_real_,
      TRUE ~ income_annual_24
    )
  )

Converting all financial values to USD

df_final <- df_final |>
  dplyr::mutate(
    
    # We have already set values of 0 as NA earlier
    income_annual_24_gross_USD = income_annual_24_gross *
      one_local_unit_to_USD_conversion,

    # If assets or debts are 0, set to NA
    assets_USD = base::ifelse(
      is.na(assets_reviewed) | assets_reviewed == 0,
      NA_real_,
      assets_reviewed * one_local_unit_to_USD_conversion
    ),
    debts_USD = base::ifelse(
      is.na(debts_reviewed) | debts_reviewed == 0,
      NA_real_,
      debts_reviewed * one_local_unit_to_USD_conversion
    )
  )

Calculating z-scores and percentiles for assets and debts

# Sanity check: View means and sds
df_final |>
  dplyr::group_by(country, income_type) |>
  dplyr::summarise(
    n_income = base::round(base::sum(!is.na(income_annual_24_gross_USD)), 2),
    mean_income = base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 2),
    sd_income = base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 2),

    n_assets = base::round(base::sum(!is.na(assets_USD)), 2),
    mean_assets = base::round(base::mean(assets_USD, na.rm = TRUE), 2),
    sd_assets = base::round(stats::sd(assets_USD, na.rm = TRUE), 2),

    n_debts = base::round(base::sum(!is.na(debts_USD)), 2),
    mean_debts = base::round(base::mean(debts_USD, na.rm = TRUE), 2),
    sd_debts = base::round(stats::sd(debts_USD, na.rm = TRUE), 2)
  )  |>
  print_reactable(sorted_col = "country", width = 900)
df_final <- df_final |>
  dplyr::group_by(country) |>
  dplyr::mutate(
    income_USD_z_local = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      (income_annual_24_gross_USD - 
         base::mean(income_annual_24_gross_USD, na.rm = TRUE))
      / stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
      NA_real_
    ),
    
    assets_USD_z_local = base::ifelse(
      !is.na(assets_USD),
      (assets_USD - base::mean(assets_USD, na.rm = TRUE))
      / stats::sd(assets_USD, na.rm = TRUE),
      NA_real_
    ),
    
    debts_USD_z_local = base::ifelse(
      !is.na(debts_USD),
      (debts_USD - base::mean(debts_USD, na.rm = TRUE))
      / stats::sd(debts_USD, na.rm = TRUE),
      NA_real_
    ),
    
    income_USD_percentile_local = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      dplyr::percent_rank(income_annual_24_gross_USD),
      NA_real_
    ),
    
    assets_USD_percentile_local = base::ifelse(
      !is.na(assets_USD),
      dplyr::percent_rank(assets_USD),
      NA_real_
    ),
      
    debts_USD_percentile_local = base::ifelse(
      !is.na(debts_USD),
      dplyr::percent_rank(debts_USD),
      NA_real_
    )
  ) |>
  dplyr::ungroup() |>
  dplyr::mutate(
    income_USD_z_full = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      (income_annual_24_gross_USD - 
         base::mean(income_annual_24_gross_USD, na.rm = TRUE))
      / stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
      NA_real_
    ),
    
    assets_USD_z_full = base::ifelse(
    !is.na(assets_USD),
    (assets_USD - base::mean(assets_USD, na.rm = TRUE)) 
    / stats::sd(assets_USD, na.rm = TRUE),
      NA_real_
    ),
    
    debts_USD_z_full = base::ifelse(
    !is.na(debts_USD),
    (debts_USD  - base::mean(debts_USD, na.rm = TRUE)) 
    / stats::sd(debts_USD, na.rm = TRUE),
      NA_real_
    ),
    
    income_USD_percentile_full = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      dplyr::percent_rank(income_annual_24_gross_USD),
      NA_real_
    ),
    
    assets_USD_percentile_full = base::ifelse(
    !is.na(assets_USD),
    dplyr::percent_rank(assets_USD),
      NA_real_
    ),
    
    debts_USD_percentile_full = base::ifelse(
    !is.na(debts_USD),
    dplyr::percent_rank(debts_USD),
      NA_real_
    )
  )


(df_gmh |> filter(is.na(income_cont)) |> nrow()) + 
  (df_gmh |> filter(income_cont == 0) |> nrow())
[1] 2664
df_gmh |> filter(is.na(income_annual)) |> nrow()
[1] 2664
df_gmh |> filter(is.na(income_annual_24)) |> nrow()
[1] 2664
(df_gmh |> filter(is.na(income_annual_24) & income_type != "net") |> nrow()) + 
  (df_gmh |> filter(income_type == "net") |> nrow())
[1] 13180
df_gmh |> filter(is.na(income_annual_24_gross)) |> nrow()
[1] 13180
df_gmh |> filter(is.na(income_annual_24_gross_USD)) |> nrow()
[1] 13180

A0.4. Weights data

For Moldova, Romania, Nigeria, Montenegro, Angola, Morocco, Uruguay, Paraguay, Greece, Iran, Hungary, Kosovo, Yemen, Chile, and Uganda, values of 1 were used instead of weighted scores.

# Load weights computed based on age, education, sex, and country
weights <- base::readRDS("444_weighted_data.RDS")

# Sanity check: View participants without weights due to missing sociodemographics
weights |>
  dplyr::filter(is.na(ps_weight), !is.na(education_recoded_cat)) |>
  dplyr::select(ResponseId, country, age, sex_binary_cat, education_recoded_cat) |>
  print_reactable(sorted_col = "country", width = 600)
# Merge weights into main data
nrow(df_final)
[1] 53799
df_gmh <- df_final |>
  dplyr::left_join( weights |> dplyr::select(ResponseId, ps_weight), by = "ResponseId")

nrow(df_gmh)
[1] 53799
# For a set of countries, recode the weight score to 1. Also recode NA to 1.
df_gmh <- df_gmh %>%
  mutate(ps_weight = base::ifelse(
    country %in% flagged_countries,
    1,
    ps_weight),
  ps_weight_flag = base::ifelse(
    country %in% flagged_countries,
    1, 0
  ),
  ps_weight_na = base::ifelse(
    country %in% flagged_countries,
    NA_real_, ps_weight
  ),
  ps_weight = base::ifelse(is.na(ps_weight), 1, ps_weight)
)

# Sanity check: How many missing values in weights after transforming those to 1?
df_gmh |>
  dplyr::summarise(
    n_missing_weights = base::sum(is.na(ps_weight_na)),
    perc_missing_weights = (n_missing_weights / dplyr::n()) * 100
  ) |> base::nrow()
[1] 1
# Cleanup
rm(weights)

A0.5. Calculate Factor Scores

Global Factor Scores

fit_mpwb <- lavaan::cfa(
  'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
  data = df_gmh,
  std.lv = TRUE,
  estimator = "MLR",
  sampling.weights = "ps_weight"
)

# We don't have any missing case on any of these variables
# and lavPredict keeps the same row order according to their manual
factor_scores <- lavaan::lavPredict(fit_mpwb, type = "lv")
df_gmh$mpwb_factor_global <- factor_scores[,1]

# View loadings
summary(fit_mpwb, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)
lavaan 0.6-19 ended normally after 15 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        20

  Number of observations                          53799
  Sampling weights variable                   ps_weight

Model Test User Model:
                                               Standard      Scaled
  Test Statistic                              11658.139    4049.674
  Degrees of freedom                                 35          35
  P-value (Chi-square)                            0.000       0.000
  Scaling correction factor                                   2.879
    Yuan-Bentler correction (Mplus variant)                        

Model Test Baseline Model:

  Test statistic                            281635.351   94343.306
  Degrees of freedom                                45          45
  P-value                                        0.000       0.000
  Scaling correction factor                                  2.985

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.959       0.957
  Tucker-Lewis Index (TLI)                       0.947       0.945
                                                                  
  Robust Comparative Fit Index (CFI)                         0.959
  Robust Tucker-Lewis Index (TLI)                            0.947

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)            -840221.377 -840221.377
  Scaling correction factor                                  2.624
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)    -834392.308 -834392.308
  Scaling correction factor                                  2.786
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                             1680482.755 1680482.755
  Bayesian (BIC)                           1680660.615 1680660.615
  Sample-size adjusted Bayesian (SABIC)    1680597.055 1680597.055

Root Mean Square Error of Approximation:

  RMSEA                                          0.079       0.046
  90 Percent confidence interval - lower         0.077       0.045
  90 Percent confidence interval - upper         0.080       0.047
  P-value H_0: RMSEA <= 0.050                    0.000       1.000
  P-value H_0: RMSEA >= 0.080                    0.025       0.000
                                                                  
  Robust RMSEA                                               0.078
  90 Percent confidence interval - lower                     0.076
  90 Percent confidence interval - upper                     0.080
  P-value H_0: Robust RMSEA <= 0.050                         0.000
  P-value H_0: Robust RMSEA >= 0.080                         0.092

Standardized Root Mean Square Residual:

  SRMR                                           0.031       0.031

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  mpwb =~                                                               
    mpwb_competenc    1.064    0.009  124.883    0.000    1.064    0.737
    mpwb_mtnl_stbl    1.123    0.008  139.341    0.000    1.123    0.743
    mpwb_engagemnt    0.773    0.009   85.772    0.000    0.773    0.577
    mpwb_meaning      1.127    0.009  130.748    0.000    1.127    0.757
    mpwb_optimism     1.213    0.008  145.537    0.000    1.213    0.763
    mpwb_postv_mtn    1.202    0.008  155.756    0.000    1.202    0.821
    mpwb_pstv_rltn    0.743    0.010   76.452    0.000    0.743    0.508
    mpwb_resilienc    0.970    0.008  114.688    0.000    0.970    0.654
    mpwb_self_estm    1.196    0.008  151.048    0.000    1.196    0.800
    mpwb_vitality     1.189    0.008  153.797    0.000    1.189    0.762

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .mpwb_competenc    0.951    0.012   80.689    0.000    0.951    0.457
   .mpwb_mtnl_stbl    1.023    0.012   84.746    0.000    1.023    0.448
   .mpwb_engagemnt    1.194    0.013   90.907    0.000    1.194    0.667
   .mpwb_meaning      0.949    0.012   76.064    0.000    0.949    0.428
   .mpwb_optimism     1.059    0.013   79.653    0.000    1.059    0.418
   .mpwb_postv_mtn    0.698    0.010   67.005    0.000    0.698    0.326
   .mpwb_pstv_rltn    1.583    0.017   95.070    0.000    1.583    0.742
   .mpwb_resilienc    1.262    0.013   99.437    0.000    1.262    0.573
   .mpwb_self_estm    0.804    0.011   71.741    0.000    0.804    0.360
   .mpwb_vitality     1.024    0.012   83.341    0.000    1.024    0.420
    mpwb              1.000                               1.000    1.000

R-Square:
                   Estimate
    mpwb_competenc    0.543
    mpwb_mtnl_stbl    0.552
    mpwb_engagemnt    0.333
    mpwb_meaning      0.572
    mpwb_optimism     0.582
    mpwb_postv_mtn    0.674
    mpwb_pstv_rltn    0.258
    mpwb_resilienc    0.427
    mpwb_self_estm    0.640
    mpwb_vitality     0.580
# Correlations (unweighted)
stats::cor(df_gmh$mpwb_factor_global, df_gmh$mpwb_sum) |> round(3)
[1] 0.995
stats::cor(df_gmh$mpwb_factor_global, df_gmh$mpwb_mean) |> round(3)
[1] 0.995
# Correlations (weighted)
weighted_corr(df_gmh, mpwb_factor_global, mpwb_sum)
      r        t     p
1 0.995 22094.64 <.001
weighted_corr(df_gmh, mpwb_factor_global, mpwb_mean)
      r        t     p
1 0.995 21926.65 <.001
# Sanity check: How many missing values in global factor scores?
df_gmh |>
  dplyr::filter(is.na(mpwb_factor_global)) |>
  base::nrow()
[1] 0
# Cleanup
rm(fit_mpwb, factor_scores)

Within Country Factor Scores

# Split data by country
country_list <- base::split(df_gmh, df_gmh$country)

# For each country we will fit CFA and extract scores
country_scores <- lapply(country_list, function(country_data) {

  fit <- lavaan::cfa(
    'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
    data = country_data,
    std.lv = TRUE,
    estimator = "MLR",
    sampling.weights = "ps_weight"
  )

  factor_scores <- lavaan::lavPredict(fit, type = "lv")[, 1]

  country_data$mpwb_factor_within <- factor_scores

  return(country_data)
})

# Recombine all countries
df_gmh <- dplyr::bind_rows(country_scores)

# Sanity check
df_gmh |>
  dplyr::group_by(country) |>
  dplyr::group_modify(~ {
    tibble::tibble(
      n = base::nrow(.x),
      mean_factor_within = base::round(base::mean(.x$mpwb_factor_within, na.rm = TRUE), 2),
      sd_factor_within = base::round(stats::sd(.x$mpwb_factor_within, na.rm = TRUE), 2),
      cor_mpwb_sum = base::round(
        stats::cor(.x$mpwb_factor_within, .x$mpwb_sum, use = "complete.obs"),
        3
      ),
      cor_mpwb_mean = base::round(
        stats::cor(.x$mpwb_factor_within, .x$mpwb_mean, use = "complete.obs"),
        3
      ),
      cor_mpwb_sum_wt = base::round(
        weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]],
        3
      ),
      cor_mpwb_mean_wt = base::round(
        weighted_corr(.x, mpwb_factor_within, mpwb_mean)[[1]],
        3
      )
    )
  }) |>
  dplyr::ungroup() |>
  print_reactable(sorted_col = "country", width = 500)
Error in base::round(weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]], : non-numeric argument to mathematical function
# Sanity check: How many missing values in factor scores?
df_gmh |>
  dplyr::filter(is.na(mpwb_factor_within)) |>
  base::nrow()
[1] 0
# Cleanup
rm(country_list, country_scores)

A0.6 Saving data

# Write labels from codebook to df_gmh
codebook_label <- codebook |>
  dplyr::select(variable, label) |>
  (\(x) { stats::setNames(x$label, x$variable) })()

for (v in names(codebook_label)) {
  labelled::var_label(df_gmh[[v]]) <- codebook_label[[v]]
}

# Save cleaned data
saveRDS(df_gmh, "999_cleaned_data.rds")
write.csv(df_gmh, "999_cleaned_data.csv", row.names = FALSE)

rm(v, codebook_label)

A1. Findings’ Timeline

Show the code
# Sanity check
class(df_gmh$StartDate)
[1] "POSIXct" "POSIXt" 
Show the code
attr(df_gmh$StartDate, "tzone")
[1] "America/New_York"
Show the code
# Only consider main dataset
df_time <- df_gmh |>
  dplyr::mutate(
    StartDate = lubridate::as_date(StartDate)
  ) |>
  dplyr::filter(!is.na(StartDate))

# Daily aggregates of mpwb_sum
daily_sum <- df_time |>
  dplyr::group_by(StartDate) |>
  dplyr::summarise(
    n = dplyr::n(),
    sum_x = base::sum(mpwb_sum),
    sum_x2 = base::sum(mpwb_sum^2)
  )

cum_sum <- daily_sum |>
  dplyr::arrange(StartDate) |>
  dplyr::mutate(
    cum_n = cumsum(n),
    cum_sum = cumsum(sum_x),
    cum_sumsq = cumsum(sum_x2),
    mean = cum_sum / cum_n,
    var = dplyr::if_else(
      cum_n > 1,
      (cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
      NA_real_
    ),
    se = sqrt(var / cum_n),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  )

p_daily_n <- 
  ggplot2::ggplot(daily_sum, ggplot2::aes(x = StartDate, y = n)) +
  ggplot2::geom_col(width = 1, fill = "#11357f") +
  ggplot2::labs(x = NULL, y = "Daily n\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

p_cum_mean <- 
  ggplot2::ggplot(cum_sum, ggplot2::aes(x = StartDate, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::labs(x = "Date", y = "MPWB Sum (Rolling mean)\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

# Daily aggregates per measure
long_dim <- df_time |>
  dplyr::select(StartDate, dplyr::all_of(mpwb_items)) |>
  tidyr::pivot_longer(
    cols = dplyr::all_of(mpwb_items),
    names_to = "measure",
    values_to = "value"
  )

daily_dim <- long_dim |>
  dplyr::group_by(StartDate, measure) |>
  dplyr::summarise(
    n = dplyr::n(),
    sum_x = sum(value),
    sum_x2 = sum(value^2)
  )

cum_dim <- daily_dim |>
  dplyr::group_by(measure) |>
  dplyr::arrange(StartDate, .by_group = TRUE) |>
  dplyr::mutate(
    cum_n = base::cumsum(n),
    cum_sum = base::cumsum(sum_x),
    cum_sumsq = base::cumsum(sum_x2),
    mean = cum_sum / cum_n,
    var = dplyr::if_else(
      cum_n > 1,
      (cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
      NA_real_
    ),
    se = base::sqrt(var / cum_n),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  ) |>
  dplyr::mutate(measure_lab = dplyr::recode(measure, !!!mpwb_labels))

p_dims <- 
  ggplot2::ggplot(cum_dim, ggplot2::aes(x = StartDate, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.8, colour = "#11357f") +
  ggplot2::facet_wrap(~ measure_lab, ncol = 2) +
  ggplot2::labs(x = "Date", y = "Rolling mean") +
  theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25),
  )

# Overall
daily_overall <- df_time |>
  tidyr::pivot_longer(
    cols = mpwb_sum,
    names_to = "measure",
    values_to = "value"
  ) |>
  dplyr::group_by(StartDate, measure) |>
  dplyr::summarise(
    n = dplyr::n(),
    mean_val = base::mean(value, na.rm = TRUE),
    sd_val = stats::sd(value, na.rm = TRUE),
    se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
    lo_val = mean_val - 1.96 * se_val,
    hi_val = mean_val + 1.96 * se_val
  )

range_dates <- base::range(daily_overall$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")

p_overall <- 
  ggplot2::ggplot(daily_overall, ggplot2::aes(x = StartDate, y = mean_val)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::scale_x_date(breaks = breaks_daily,
    labels = daily_overall |>
      dplyr::mutate(
        day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
        mon = base::format(StartDate, "%b"),
        label = if (day == "1") paste(day, mon) else day
      ) |>
      dplyr::pull(label),
    expand = base::c(0.01, 0.01)
  ) +
  ggplot2::labs( x = NULL, y = "MPWB sum") +
  theme_gmh +
  theme(axis.text.x = element_text(
    margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
    ), 
        panel.grid.major.x = ggplot2::element_line(
          color = "#ddeded", linewidth = 0.25
        )
  )
cowplot::plot_grid(
  p_daily_n, p_cum_mean,
  ncol = 1, rel_heights = c(0.35, 0.65), align = "v"
)

p_dims

p_overall

Show the code
# Daily aggregates of ls
daily_sum_ls <- df_time |>
  dplyr::group_by(StartDate) |>
  dplyr::summarise(
    n = dplyr::n(),
    sum_x = base::sum(life_satisfaction),
    sum_x2 = base::sum(life_satisfaction^2)
  )

cum_sum_ls <- daily_sum_ls |>
  dplyr::arrange(StartDate) |>
  dplyr::mutate(
    cum_n = cumsum(n),
    cum_sum = cumsum(sum_x),
    cum_sumsq = cumsum(sum_x2),
    mean = cum_sum / cum_n,
    var = dplyr::if_else(
      cum_n > 1,
      (cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
      NA_real_
    ),
    se = sqrt(var / cum_n),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  )

p_daily_n_ls <- 
  ggplot2::ggplot(daily_sum_ls, ggplot2::aes(x = StartDate, y = n)) +
  ggplot2::geom_col(width = 1, fill = "#11357f") +
  ggplot2::labs(x = NULL, y = "Daily n\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

p_cum_mean_ls <- 
  ggplot2::ggplot(cum_sum_ls, ggplot2::aes(x = StartDate, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::labs(x = "Date", y = "Life satisfaction (Rolling mean)\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

# Overall
daily_overall_ls <- df_time |>
  tidyr::pivot_longer(
    cols = life_satisfaction,
    names_to = "measure",
    values_to = "value"
  ) |>
  dplyr::group_by(StartDate, measure) |>
  dplyr::summarise(
    n = dplyr::n(),
    mean_val = base::mean(value, na.rm = TRUE),
    sd_val = stats::sd(value, na.rm = TRUE),
    se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
    lo_val = mean_val - 1.96 * se_val,
    hi_val = mean_val + 1.96 * se_val
  )

range_dates <- base::range(daily_overall_ls$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")

p_overall_ls <- 
  ggplot2::ggplot(daily_overall_LS, ggplot2::aes(x = StartDate, y = mean_val)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::scale_x_date(breaks = breaks_daily,
    labels = daily_overall |>
      dplyr::mutate(
        day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
        mon = base::format(StartDate, "%b"),
        label = if (day == "1") paste(day, mon) else day
      ) |>
      dplyr::pull(label),
    expand = base::c(0.01, 0.01)
  ) +
  ggplot2::labs( x = NULL, y = "Life satisfaction") +
  theme_gmh +
  theme(axis.text.x = element_text(
    margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
    ), 
        panel.grid.major.x = ggplot2::element_line(
          color = "#ddeded", linewidth = 0.25
        )
  )
Error: object 'daily_overall_LS' not found
cowplot::plot_grid(
  p_daily_n_ls, p_cum_mean_ls,
  ncol = 1, rel_heights = c(0.35, 0.65), align = "v"
)

p_overall_ls
Error: object 'p_overall_ls' not found
Show the code
df_time_phq <- df_gmh |>
  dplyr::mutate(
    StartDate = lubridate::as_date(StartDate)
  ) |>
  dplyr::filter(!is.na(StartDate) & !is.na(gad_worry))

# Daily aggregates of mpwb_sum
daily_sum_phq <- df_time_phq |>
  dplyr::group_by(StartDate) |>
  dplyr::summarise(
    n = dplyr::n(),
    sum_x = base::sum(phq4_sum),
    sum_x2 = base::sum(phq4_sum^2)
  )

cum_sum_phq <- daily_sum_phq |>
  dplyr::arrange(StartDate) |>
  dplyr::mutate(
    cum_n = cumsum(n),
    cum_sum = cumsum(sum_x),
    cum_sumsq = cumsum(sum_x2),
    mean = cum_sum / cum_n,
    var = dplyr::if_else(
      cum_n > 1,
      (cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
      NA_real_
    ),
    se = sqrt(var / cum_n),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  )

p_daily_n_phq <- 
  ggplot2::ggplot(daily_sum_phq, ggplot2::aes(x = StartDate, y = n)) +
  ggplot2::geom_col(width = 1, fill = "#11357f") +
  ggplot2::labs(x = NULL, y = "Daily n\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

p_cum_mean_phq <- 
  ggplot2::ggplot(cum_sum_phq, ggplot2::aes(x = StartDate, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::labs(x = "Date", y = "PHQ-4 (Rolling mean)\n") +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
  )

# Daily aggregates per measure
long_dim_phq <- df_time_phq |>
  dplyr::select(StartDate, dplyr::all_of(phq4_items)) |>
  tidyr::pivot_longer(
    cols = dplyr::all_of(phq4_items),
    names_to = "measure",
    values_to = "value"
  )

daily_dim_phq <- long_dim_phq |>
  dplyr::group_by(StartDate, measure) |>
  dplyr::summarise(
    n = dplyr::n(),
    sum_x = sum(value),
    sum_x2 = sum(value^2)
  )

cum_dim_phq <- daily_dim_phq |>
  dplyr::group_by(measure) |>
  dplyr::arrange(StartDate, .by_group = TRUE) |>
  dplyr::mutate(
    cum_n = base::cumsum(n),
    cum_sum = base::cumsum(sum_x),
    cum_sumsq = base::cumsum(sum_x2),
    mean = cum_sum / cum_n,
    var = dplyr::if_else(
      cum_n > 1,
      (cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
      NA_real_
    ),
    se = base::sqrt(var / cum_n),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  )

p_dims_phq <- 
  ggplot2::ggplot(cum_dim_phq, ggplot2::aes(x = StartDate, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.8, colour = "#11357f") +
  ggplot2::facet_wrap(~ measure_lab, ncol = 2) +
  ggplot2::labs(x = "Date", y = "Rolling mean") +
  theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25),
  )


# Overall
daily_overall_phq <- df_time |>
  tidyr::pivot_longer(
    cols = phq4_sum,
    names_to = "measure",
    values_to = "value"
  ) |>
  dplyr::group_by(StartDate, measure) |>
  dplyr::summarise(
    n = dplyr::n(),
    mean_val = base::mean(value, na.rm = TRUE),
    sd_val = stats::sd(value, na.rm = TRUE),
    se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
    lo_val = mean_val - 1.96 * se_val,
    hi_val = mean_val + 1.96 * se_val
  )

range_dates <- base::range(daily_overall_phq$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")

p_overall_phq <- 
  ggplot2::ggplot(daily_overall_phq, ggplot2::aes(x = StartDate, y = mean_val)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
  ggplot2::geom_point(size = 0.9, colour = "#11357f") +
  ggplot2::scale_x_date(breaks = breaks_daily,
    labels = daily_overall |>
      dplyr::mutate(
        day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
        mon = base::format(StartDate, "%b"),
        label = if (day == "1") paste(day, mon) else day
      ) |>
      dplyr::pull(label),
    expand = base::c(0.01, 0.01)
  ) +
  ggplot2::labs( x = NULL, y = "PHQ-4") +
  theme_gmh +
  theme(axis.text.x = element_text(
    margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
    ), 
        panel.grid.major.x = ggplot2::element_line(
          color = "#ddeded", linewidth = 0.25
        )
  )
cowplot::plot_grid(
  p_daily_n_phq, p_cum_mean_phq,
  ncol = 1, rel_heights = c(0.35, 0.65), align = "v"
)

p_dims_phq
Error in `combine_vars()`:
! At least one layer must contain all faceting variables: `measure_lab`
✖ Plot is missing `measure_lab`
✖ Layer 1 is missing `measure_lab`
✖ Layer 2 is missing `measure_lab`
✖ Layer 3 is missing `measure_lab`
p_overall_phq

A2. MPWB Descriptives

Table 1

# Extract mpwb item labels from codebook
mpwb_measure <- codebook |>
  dplyr::filter(variable %in% mpwb_items) |>
  dplyr::pull(label, variable)

# Calculate estimates
mpwb_base1 <- df_gmh |>
  dplyr::select(ps_weight, dplyr::all_of(mpwb_items)) |>
  tidyr::pivot_longer(
    cols = -ps_weight, names_to = "variable", values_to = "value"
    ) |>
  # Calculate how many participants responded 6 or 7 (flourishing) to mpwb items
  dplyr::mutate(is_flourish = value %in% c(6, 7)) |>
  dplyr::group_by(variable) |>
  dplyr::summarise(
    mean_mpwb = Hmisc::wtd.mean(value, ps_weight, na.rm = TRUE),
    sd_mpwb = sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)),
    # is_flourish is boolean, so mean gives proportion
    pct = 100 * Hmisc::wtd.mean(is_flourish, ps_weight, na.rm = TRUE),
    n = dplyr::n()
  ) |>
  dplyr::mutate(
    dimension = mpwb_labels[variable],
    description = mpwb_measure[variable],
    mean_sd = sprintf("%.2f (%.2f)", mean_mpwb, sd_mpwb),
    pct_flourishing = sprintf("%.1f", pct),
    m = mean_mpwb,
  ) |>
  dplyr::arrange(-m) |>
  dplyr::select(dimension, description, mean_sd, pct_flourishing, n)

mpwb_table1 <- mpwb_base1 |>
  dplyr::mutate(`% flourishing` = pct_flourishing) |>
  dplyr::select(dimension, description, mean_sd, `% flourishing`) |>
  gt::gt() |>
  gt::cols_label(
    dimension = "Dimension",
    description = "Measure",
    mean_sd = gt::md("*M*<sub>weighted</sub> (*SD*)"),
    `% flourishing` = gt::md("% flourishing<sub>weighted</sub>")
  ) |>
  gt::fmt_markdown(columns = c("mean_sd", "% flourishing")) |>
  gt::tab_options(
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.top.width = gt::px(1),
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    table.border.bottom.width = gt::px(1),
    heading.border.bottom.color = "black",
    heading.border.bottom.style = "solid",
    heading.border.bottom.width = gt::px(1),
    table_body.hlines.color = "white",
    row.striping.include_table_body = FALSE
  ); mpwb_table1
Dimension Measure Mweighted (SD)
Positive relationships I receive help and support from people I am close to when I need it. 5.05 (1.46) 38.5
Meaning I feel what I do in my life is valuable and worthwhile. 4.92 (1.49) 35.6
Competence I feel a sense of accomplishment from what I do. 4.83 (1.44) 31.8
Engagement I feel absorbed in what I am doing. 4.81 (1.34) 28.6
Self-esteem I feel positive about myself. 4.80 (1.50) 32.2
Optimism I am optimistic about my future. 4.70 (1.59) 31.3
Positive emotion I feel happy. 4.69 (1.46) 27.6
Emotional stability I feel calm and peaceful. 4.46 (1.51) 23.4
Resilience I recover quickly from things that go wrong in my life. 4.44 (1.48) 22.1
Vitality I feel full of energy. 4.19 (1.56) 19.2
# Save table as image and docx
gt::gtsave(mpwb_table1, filename = "555_table1.png")
file:////var/folders/57/fnv45qmj3_v67tzm1lxyx18r0000gn/T//RtmpDIh10E/file6ab961475642.html screenshot completed
gt::gtsave(mpwb_table1, "555_table1.docx")

# Cleanup
rm(mpwb_base1, mpwb_table1)

Item Flourishing Estimates Per Country

df_gmh |>
  dplyr::select(country, ps_weight, dplyr::all_of(mpwb_items)) |>
  tidyr::pivot_longer(
    cols = -c(country, ps_weight),
    names_to = "variable",
    values_to = "value"
  ) |>
  dplyr::mutate(is_flourish = value %in% c(6, 7)) |>
  dplyr::group_by(country, variable) |>
  dplyr::summarise(
    mean_mpwb = Hmisc::wtd.mean(value, ps_weight, na.rm = TRUE),
    sd_mpwb = base::sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)),
    pct = 100 * Hmisc::wtd.mean(is_flourish, ps_weight, na.rm = TRUE),
    n = dplyr::n(),
    .groups = "drop"
  ) |>
  dplyr::mutate(
    dimension = mpwb_labels[variable],
    description = mpwb_measure[variable],
    mean_sd = sprintf("%.2f (%.2f)", mean_mpwb, sd_mpwb),
    pct_flourishing = sprintf("%.1f", pct),
    m = mean_mpwb
  ) |>
  dplyr::arrange(country, -m) |>
  dplyr::select(country, dimension, description, mean_sd, pct_flourishing, n) |>
  print_reactable(sorted_col = c("country"), width = 800)
# Cleanup
rm(mpwb_measure)

Flourishing Estimates considering PHQ-4 (rescaled)

# Using the simplified rule of scoring 60 or above on MPWB and two or below on PHQ-4, we find X% to be flourishing, ranging from X% in [lowest country] to X% in [highest country]. Future work on this data may provide more statistically robust approaches and interpretations, as well as incorporate other measures such as income and outlook.

# Flag flourishing (MPWB ≥ 60 and PHQ-4 rescaled ≤ 2)
df_flourish <- df_gmh |>
  dplyr::filter(!is.na(gad_worry)) |>
  dplyr::mutate(
    flourishing = base::as.integer(mpwb_sum >= 60 & phq4_sum_rec <= 2)
  )

table(df_flourish$phq4_sum_rec, df_flourish$flourishing)
    
         0     1
  0    947   870
  1   1695   815
  2   3362   901
  3   6208     0
  4  13246     0
  5   1858     0
  6   2246     0
  7   1111     0
  8   1552     0
  9    770     0
  10  1015     0
  11   482     0
  12  1431     0
table(df_flourish$mpwb_sum, df_flourish$flourishing)
    
        0    1
  10   59    0
  11   22    0
  12   30    0
  13   35    0
  14   56    0
  15   48    0
  16   72    0
  17   78    0
  18   95    0
  19   92    0
  20  108    0
  21  128    0
  22  160    0
  23  191    0
  24  191    0
  25  220    0
  26  259    0
  27  236    0
  28  323    0
  29  348    0
  30  400    0
  31  445    0
  32  494    0
  33  536    0
  34  636    0
  35  668    0
  36  737    0
  37  725    0
  38  884    0
  39  900    0
  40 1087    0
  41 1065    0
  42 1121    0
  43 1148    0
  44 1218    0
  45 1318    0
  46 1392    0
  47 1381    0
  48 1545    0
  49 1508    0
  50 1609    0
  51 1448    0
  52 1345    0
  53 1290    0
  54 1212    0
  55 1172    0
  56 1125    0
  57 1011    0
  58  901    0
  59  813    0
  60  426  375
  61  342  307
  62  263  286
  63  213  283
  64  208  247
  65  127  198
  66  123  180
  67   83  167
  68   82  148
  69   48  112
  70  123  283
# Aggregate weighted percentage flourishing
df_flourish |>
  dplyr::summarise(
    pct_flourishing = 
      100 * Hmisc::wtd.mean(flourishing, ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct_flourishing = base::round(pct_flourishing, 1)
  )
# A tibble: 1 × 1
  pct_flourishing
            <dbl>
1             6.5
# 4) Percentage flourishing by country
country_pct <- df_flourish |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    pct_flourishing = 
      100 * Hmisc::wtd.mean(flourishing, ps_weight, na.rm = TRUE),
  ) |>
  dplyr::mutate(
    pct_flourishing = base::round(pct_flourishing, 1)
  ) |>
  dplyr::arrange(pct_flourishing)

# Country with the most and least flourishing
country_pct |>
  dplyr::slice_max(pct_flourishing, n = 1, with_ties = FALSE)
# A tibble: 1 × 2
  country         pct_flourishing
  <chr>                     <dbl>
1 North Macedonia              24
country_pct |>
  dplyr::slice_min(pct_flourishing, n = 1, with_ties = FALSE)
# A tibble: 1 × 2
  country    pct_flourishing
  <chr>                <dbl>
1 Madagascar               0
# Sanity check
df_flourish |> 
  dplyr::filter(country == "Madagascar", mpwb_sum >= 60, phq4_sum_rec <= 2) |>
  base::nrow()
[1] 0

A3. Optional Section Completion Rates

# Overall completion rate of the optional section
df_gmh |>
  dplyr::summarise(
    n_total = dplyr::n(),
    n_optional = base::sum(n_items_after == 10),
    pct_optional = round(100 * n_optional / n_total, 1)
  )
# A tibble: 1 × 3
  n_total n_optional pct_optional
    <int>      <int>        <dbl>
1   53799      28229         52.5
# Overall completion rate of the optional section by country
opt_by_country <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n_total = dplyr::n(),
    n_optional = base::sum(n_items_after == 10),
    pct_optional = round(100 * n_optional / n_total, 1)
  )

opt_by_country |> dplyr::slice_min(pct_optional, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country n_total n_optional pct_optional
  <chr>     <int>      <int>        <dbl>
1 Japan       431         18          4.2
opt_by_country |> dplyr::slice_max(pct_optional, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country n_total n_optional pct_optional
  <chr>     <int>      <int>        <dbl>
1 Ukraine     654        530           81
opt_by_country |> 
  print_reactable(sorted_col = "country", width = 600)
# Completion time of participants that completed the optional section
df_gmh |>
  dplyr::filter(n_items_after == 10) |>
  dplyr::summarise(med = stats::median(duration_sec, na.rm = TRUE) / 60) |>
  dplyr::pull(med)
[1] 6.316667
# Completion time of participants that completed the mandatory section only
df_gmh |>
  dplyr::filter(n_items_after == 0) |>
  dplyr::summarise(med = stats::median(duration_sec, na.rm = TRUE) / 60) |>
  dplyr::pull(med)
[1] 6.8

A4. Aggregate Descriptive Information

# First, recode education and employment variables
lvl_emp <- levels(df_gmh$employment_primary)
lvl_edu <- levels(df_gmh$education_recoded_cat)

df_agg <- df_gmh |>
  dplyr::mutate(
    dur_min = duration_sec / 60,
    
    # Recode NA as Removed for sex and income variables
    sex_reviewed_cat = base::factor(
      dplyr::case_when(
        is.na(sex_reviewed_cat) ~ "Removed",
        TRUE ~ sex_reviewed_cat
      ),
      levels = c("Male","Female","Other","Removed"),
      ordered = FALSE
    ),
    
    income_merg_cat = base::factor(
      dplyr::case_when(
        base::is.na(income_merg_cat) ~ "Removed",
        TRUE ~ income_merg_cat
      ),
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile",
        "Removed"
      ),
      ordered = TRUE
    ),
    
    # Consider self-employed as part-time employed
    # (31 from the sponsored dataset provided by the team representing Ireland)
    employment_primary = as.character(employment_primary),
    employment_primary = dplyr::if_else(
      employment_irl == "Self-employed",
      "Employed/working part-time (less than 25 hours per week)",
      employment_primary,
      missing = employment_primary
    ),
    employment_primary = base::factor(employment_primary, levels = lvl_emp),

    # Consider Inclusive education as Technical
    # (30 from the Peru)
    education_recoded_cat = as.character(education_recoded_cat),
    education_recoded_cat = dplyr::if_else(
      education_cat == "Inclusive education",
      "Technical",
      education_recoded_cat,
      missing = education_recoded_cat
    ),
    education_recoded_cat = factor(education_recoded_cat, levels = lvl_edu)
  )

# Function for categorical variables
tbl_block <- function(df, var_name, label, drop_na = FALSE, show_header = FALSE) {
  x <- df[[var_name]]
  keep <- if (drop_na) !is.na(x) else rep(TRUE, length(x))
  df_keep <- df[keep, , drop = FALSE]

  # unweighted counts and percentages
  tab_u <- df_keep |>
    dplyr::count(.data[[var_name]], name = "n_u") |>
    dplyr::mutate(
      level = as.character(.data[[var_name]]),
      pct_u = 100 * n_u / sum(n_u)
    )

  # weighted counts and percentages
  tab_w <- df_keep |>
    dplyr::group_by(.data[[var_name]]) |>
    dplyr::summarise(n_w = sum(ps_weight, na.rm = TRUE)) |>
    dplyr::mutate(
      level = as.character(.data[[var_name]]),
      pct_w = 100 * n_w / sum(n_w)
    )

  # join weighted/unweighted
  tab <- dplyr::full_join(tab_u, tab_w, by = "level")

  # format display
  tab <- tab |>
    dplyr::mutate(
      unweighted = paste0(
        ifelse(is.na(n_u), 0, n_u),
        " (", sprintf("%.1f", ifelse(is.na(pct_u), 0, pct_u)), "%)"
      ),
      weighted = paste0(
        round(ifelse(is.na(n_w), 0, n_w), 0),
        " (", sprintf("%.1f", ifelse(is.na(pct_w), 0, pct_w)), "%)"
      )
    )

  header <- if (show_header) "n (%)" else ""

  dplyr::bind_rows(
    tibble::tibble(Variable = label, Unweighted = header, Weighted = header),
    tab |>
      dplyr::transmute(
        Variable = paste0("\u00A0\u00A0\u00A0", level),
        Unweighted = unweighted,
        Weighted = weighted
      )
  )
}

# duration
dur_q_u <- stats::quantile(df_agg$dur_min, probs = c(0.25, 0.5, 0.75), na.rm = TRUE, names = FALSE)
dur_Md_u <- as.numeric(dur_q_u[2])
dur_IQR_u <- as.numeric(dur_q_u[3] - dur_q_u[1])
table2 <- dplyr::bind_rows(
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tibble::tibble(Variable = "", Unweighted = "M (SD)", Weighted = "M (SD)"),
  tibble::tibble(
    Variable = "MPWB sum (range 10–70)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$mpwb_sum, na.rm = TRUE), 
              stats::sd(df_agg$mpwb_sum, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)",
              Hmisc::wtd.mean(
                df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE),
              sqrt(Hmisc::wtd.var(
                df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE)))
  ),
  tibble::tibble(
    Variable = "Life satisfaction (range 1–7)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$life_satisfaction, na.rm = TRUE), 
              stats::sd(df_agg$life_satisfaction, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)", 
              Hmisc::wtd.mean(
                df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE), 
              sqrt(Hmisc::wtd.var(
                df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE)))
  ),
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tibble::tibble(Variable = "", Unweighted = "Md (IQR)", Weighted = "Md (IQR)"),
  tibble::tibble(
    Variable = "Duration (minutes)",
    Unweighted = sprintf("%.2f (%.2f)", dur_Md_u, dur_IQR_u),
    Weighted = "\u2014"
  ),
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tbl_block(df_agg, "income_merg_cat", "Household income", show_header = TRUE),
  tbl_block(df_agg, "household_size_group", "Household size"),
  tbl_block(df_agg, "age_group", "Age group"),
  tbl_block(df_agg, "sex_reviewed_cat", "Sex"),
  tbl_block(df_agg, "education_recoded_cat", "Education level"),
  tbl_block(df_agg, "employment_primary", "Employment status"),
  tbl_block(df_agg, "citizenship_cat", "Citizenship"),
  tbl_block(df_agg, "childhood_SES_cat", "Childhood socioeconomic status", drop_na = TRUE),
  tbl_block(df_agg, "work_arrangement_cat", "Work arrangement", drop_na = TRUE)
)

gt_table2 <- gt::gt(table2) |>
  gt::cols_label(
    Variable = "Variable",
    Unweighted = "Unweighted",
    Weighted = "Weighted"
  ) |>
  gt::tab_options(
    data_row.padding = gt::px(4),
    table.border.top.color = "black",
    table.border.top.width = gt::px(0.5),
    table.border.bottom.color = "black",
    table.border.bottom.width = gt::px(0.5),
    heading.border.bottom.color = "black",
    heading.border.bottom.width = gt::px(0.5),
    table_body.hlines.color = "white",
    row.striping.include_table_body = FALSE
  ) |>
  gt::tab_style(
    style = gt::cell_text(style = "italic"),
    locations = gt::cells_body(
      rows = 
        Unweighted %in% c("M (SD)", "Md (IQR)", "n (%)") |
        Weighted %in% c("M (SD)", "Md (IQR)", "n (%)"),
      columns = c("Unweighted", "Weighted")
    )
  ); gt_table2
Variable Unweighted Weighted
M (SD) M (SD)
MPWB sum (range 10–70) 47.43 (10.94) 46.87 (11.09)
Life satisfaction (range 1–7) 6.42 (2.38) 6.30 (2.46)
Md (IQR) Md (IQR)
Duration (minutes) 6.17 (4.52)
Household income n (%) n (%)
   No income 1948 (3.6%) 1509 (4.1%)
   Second decile 6194 (11.5%) 4848 (13.0%)
   Third decile 6583 (12.2%) 4839 (13.0%)
   Fourth decile 6675 (12.4%) 4518 (12.2%)
   Fifth decile 6360 (11.8%) 4342 (11.7%)
   Sixth decile 5637 (10.5%) 3870 (10.4%)
   Seventh decile 5001 (9.3%) 3239 (8.7%)
   Eighth decile 4880 (9.1%) 3195 (8.6%)
   Ninth decile 3793 (7.1%) 2385 (6.4%)
   Tenth decile 6011 (11.2%) 3880 (10.4%)
   Removed 717 (1.3%) 551 (1.5%)
Household size
   1 13846 (25.7%) 10113 (27.2%)
   2 13726 (25.5%) 9900 (26.6%)
   3 9031 (16.8%) 5974 (16.1%)
   4-5 13115 (24.4%) 8494 (22.8%)
   6-20 4081 (7.6%) 2693 (7.2%)
Age group
   18-25 11550 (21.5%) 8195 (22.0%)
   26-44 28059 (52.2%) 15684 (42.2%)
   45-64 11970 (22.2%) 10118 (27.2%)
   65-74 1776 (3.3%) 2555 (6.9%)
   75+ 444 (0.8%) 623 (1.7%)
Sex
   Male 20738 (38.5%) 16605 (44.7%)
   Female 32607 (60.6%) 20116 (54.1%)
   Other 352 (0.7%) 352 (0.9%)
   Removed 102 (0.2%) 102 (0.3%)
Education level
   Less than secondary 1346 (2.5%) 2190 (5.9%)
   Secondary 10226 (19.0%) 11556 (31.1%)
   Technical 5693 (10.6%) 4602 (12.4%)
   University 19276 (35.8%) 10454 (28.1%)
   Advanced 17258 (32.1%) 8373 (22.5%)
Employment status
   Not in paid employment (by choice/health) 3263 (6.1%) 2808 (7.6%)
   Not in paid employment (looking for work) 3883 (7.2%) 2815 (7.6%)
   Student non-working (Full or part-time) 7122 (13.2%) 4814 (12.9%)
   Employed/working full-time (25+ hours per week) 30478 (56.7%) 18986 (51.1%)
   Employed/working part-time (less than 25 hours per week) 5961 (11.1%) 4394 (11.8%)
   Retired 2288 (4.3%) 2882 (7.8%)
   Military service 804 (1.5%) 475 (1.3%)
Citizenship
   Citizen 49136 (91.3%) 33763 (90.8%)
   Non-citizen (Permanent Resident) 1400 (2.6%) 1064 (2.9%)
   Born outside country (Citizen) 1058 (2.0%) 779 (2.1%)
   Born outside country (Non-citizen, Permanent Resident) 574 (1.1%) 345 (0.9%)
   Born outside country (Non-citizen, Non-permanent Resident) 1631 (3.0%) 1224 (3.3%)
Childhood socioeconomic status
   Poor 4379 (11.4%) 3405 (12.8%)
   Below average but not poor 10022 (26.0%) 7203 (27.1%)
   Around average 14523 (37.7%) 9766 (36.7%)
   Above average but not wealthy 8439 (21.9%) 5498 (20.7%)
   Wealthy 1115 (2.9%) 715 (2.7%)
Work arrangement
   I work entirely in-person (i.e., in an office, on-site) 17150 (53.2%) 11582 (55.3%)
   I mostly work in-person, with occasional remote days 5857 (18.2%) 3572 (17.1%)
   I work about evenly in-person/remote 3375 (10.5%) 2061 (9.8%)
   I mostly work remotely, with occasional in-person days 3077 (9.5%) 1894 (9.0%)
   I work entirely remotely 2776 (8.6%) 1827 (8.7%)
gt::gtsave(gt_table2, "555_table2.png", vwidth = 1600, vheight = 2000)
file:////var/folders/57/fnv45qmj3_v67tzm1lxyx18r0000gn/T//RtmpDIh10E/file6ab976b74de5.html screenshot completed
gt::gtsave(gt_table2, "555_table2.docx")
gt::gtsave(gt_table2, "555_table2.html")
pagedown::chrome_print(
  "555_table2.html",
  output = "555_table2.pdf"
)
full_tbl_agg <- dplyr::bind_rows(
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tibble::tibble(Variable = "", Unweighted = "M (SD)", Weighted = "M (SD)"),
  tibble::tibble(
    Variable = "MPWB sum (range 10–70)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$mpwb_sum, na.rm = TRUE), 
              stats::sd(df_agg$mpwb_sum, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)",
              Hmisc::wtd.mean(
                df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE),
              sqrt(Hmisc::wtd.var(
                df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE)))
  ),
  tibble::tibble(
    Variable = "Life satisfaction (range 1–7)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$life_satisfaction, na.rm = TRUE), 
              stats::sd(df_agg$life_satisfaction, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)", 
              Hmisc::wtd.mean(
                df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE), 
              sqrt(Hmisc::wtd.var(
                df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE)))
  ),
  tibble::tibble(
    Variable = "Age (range 18–100)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$age, na.rm = TRUE), 
              stats::sd(df_agg$age, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)",
              Hmisc::wtd.mean(
                df_agg$age, df_gmh$ps_weight, na.rm = TRUE),
              sqrt(Hmisc::wtd.var(
                df_agg$age, df_gmh$ps_weight, na.rm = TRUE)))
  ),
    tibble::tibble(
    Variable = "PHQ-4 sum (range 4-28)",
    Unweighted = 
      sprintf("%.2f (%.2f)", 
              base::mean(df_agg$phq4_sum, na.rm = TRUE), 
              stats::sd(df_agg$phq4_sum, na.rm = TRUE)),
    Weighted = 
      sprintf("%.2f (%.2f)",
              Hmisc::wtd.mean(
                df_agg$phq4_sum, df_gmh$ps_weight, na.rm = TRUE),
              sqrt(Hmisc::wtd.var(
                df_agg$phq4_sum, df_gmh$ps_weight, na.rm = TRUE)))
  ),
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tibble::tibble(Variable = "", Unweighted = "Md (IQR)", Weighted = "Md (IQR)"),
  tibble::tibble(
    Variable = "Duration (minutes)",
    Unweighted = sprintf("%.2f (%.2f)", dur_Md_u, dur_IQR_u),
    Weighted = "\u2014"
  ),
  tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
  tbl_block(df_agg, "income_merg_cat", "Household income", show_header = TRUE),
  tbl_block(df_agg, "household_size_group", "Household size"),
  tbl_block(df_agg, "age_group", "Age group"),
  tbl_block(df_agg, "sex_reviewed_cat", "Sex"),
  tbl_block(df_agg, "education_recoded_cat", "Education level"),
  tbl_block(df_agg, "employment_primary", "Employment status"),
  tbl_block(df_agg, "citizenship_cat", "Citizenship"),
  tbl_block(df_agg, "childhood_SES_cat", "Childhood socioeconomic status", drop_na = TRUE),
  tbl_block(df_agg, "work_arrangement_cat", "Work arrangement", drop_na = TRUE),
  tbl_block(df_agg, "fin_outlook_cat", "Financial outlook", drop_na = TRUE),
  tbl_block(df_agg, "attention_care_cat", "Attention and Care", drop_na = TRUE)
)

gt_table2 <- gt::gt(table2) |>
  gt::cols_label(
    Variable = "Variable",
    Unweighted = "Unweighted",
    Weighted = "Weighted"
  ) |>
  gt::tab_options(
    data_row.padding = gt::px(4),
    table.border.top.color = "black",
    table.border.top.width = gt::px(0.5),
    table.border.bottom.color = "black",
    table.border.bottom.width = gt::px(0.5),
    heading.border.bottom.color = "black",
    heading.border.bottom.width = gt::px(0.5),
    table_body.hlines.color = "white",
    row.striping.include_table_body = FALSE
  ) |>
  gt::tab_style(
    style = gt::cell_text(style = "italic"),
    locations = gt::cells_body(
      rows = 
        Unweighted %in% c("M (SD)", "Md (IQR)", "n (%)") |
        Weighted %in% c("M (SD)", "Md (IQR)", "n (%)"),
      columns = c("Unweighted", "Weighted")
    )
  ); gt_table2
Variable Unweighted Weighted
M (SD) M (SD)
MPWB sum (range 10–70) 47.43 (10.94) 46.87 (11.09)
Life satisfaction (range 1–7) 6.42 (2.38) 6.30 (2.46)
Md (IQR) Md (IQR)
Duration (minutes) 6.17 (4.52)
Household income n (%) n (%)
   No income 1948 (3.6%) 1509 (4.1%)
   Second decile 6194 (11.5%) 4848 (13.0%)
   Third decile 6583 (12.2%) 4839 (13.0%)
   Fourth decile 6675 (12.4%) 4518 (12.2%)
   Fifth decile 6360 (11.8%) 4342 (11.7%)
   Sixth decile 5637 (10.5%) 3870 (10.4%)
   Seventh decile 5001 (9.3%) 3239 (8.7%)
   Eighth decile 4880 (9.1%) 3195 (8.6%)
   Ninth decile 3793 (7.1%) 2385 (6.4%)
   Tenth decile 6011 (11.2%) 3880 (10.4%)
   Removed 717 (1.3%) 551 (1.5%)
Household size
   1 13846 (25.7%) 10113 (27.2%)
   2 13726 (25.5%) 9900 (26.6%)
   3 9031 (16.8%) 5974 (16.1%)
   4-5 13115 (24.4%) 8494 (22.8%)
   6-20 4081 (7.6%) 2693 (7.2%)
Age group
   18-25 11550 (21.5%) 8195 (22.0%)
   26-44 28059 (52.2%) 15684 (42.2%)
   45-64 11970 (22.2%) 10118 (27.2%)
   65-74 1776 (3.3%) 2555 (6.9%)
   75+ 444 (0.8%) 623 (1.7%)
Sex
   Male 20738 (38.5%) 16605 (44.7%)
   Female 32607 (60.6%) 20116 (54.1%)
   Other 352 (0.7%) 352 (0.9%)
   Removed 102 (0.2%) 102 (0.3%)
Education level
   Less than secondary 1346 (2.5%) 2190 (5.9%)
   Secondary 10226 (19.0%) 11556 (31.1%)
   Technical 5693 (10.6%) 4602 (12.4%)
   University 19276 (35.8%) 10454 (28.1%)
   Advanced 17258 (32.1%) 8373 (22.5%)
Employment status
   Not in paid employment (by choice/health) 3263 (6.1%) 2808 (7.6%)
   Not in paid employment (looking for work) 3883 (7.2%) 2815 (7.6%)
   Student non-working (Full or part-time) 7122 (13.2%) 4814 (12.9%)
   Employed/working full-time (25+ hours per week) 30478 (56.7%) 18986 (51.1%)
   Employed/working part-time (less than 25 hours per week) 5961 (11.1%) 4394 (11.8%)
   Retired 2288 (4.3%) 2882 (7.8%)
   Military service 804 (1.5%) 475 (1.3%)
Citizenship
   Citizen 49136 (91.3%) 33763 (90.8%)
   Non-citizen (Permanent Resident) 1400 (2.6%) 1064 (2.9%)
   Born outside country (Citizen) 1058 (2.0%) 779 (2.1%)
   Born outside country (Non-citizen, Permanent Resident) 574 (1.1%) 345 (0.9%)
   Born outside country (Non-citizen, Non-permanent Resident) 1631 (3.0%) 1224 (3.3%)
Childhood socioeconomic status
   Poor 4379 (11.4%) 3405 (12.8%)
   Below average but not poor 10022 (26.0%) 7203 (27.1%)
   Around average 14523 (37.7%) 9766 (36.7%)
   Above average but not wealthy 8439 (21.9%) 5498 (20.7%)
   Wealthy 1115 (2.9%) 715 (2.7%)
Work arrangement
   I work entirely in-person (i.e., in an office, on-site) 17150 (53.2%) 11582 (55.3%)
   I mostly work in-person, with occasional remote days 5857 (18.2%) 3572 (17.1%)
   I work about evenly in-person/remote 3375 (10.5%) 2061 (9.8%)
   I mostly work remotely, with occasional in-person days 3077 (9.5%) 1894 (9.0%)
   I work entirely remotely 2776 (8.6%) 1827 (8.7%)
# Kish Effective Sample Size
df_gmh |>
  dplyr::summarise(
    n_obs = dplyr::n(),
    sum_w = base::sum(ps_weight, na.rm = TRUE),
    sum_w2 = base::sum(ps_weight^2, na.rm = TRUE),
    mean_w = base::mean(ps_weight, na.rm = TRUE),
    sd_w = stats::sd(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    kish_ess = (sum_w^2) / sum_w2,
    cv_w = sd_w / mean_w,
    deff = 1 + cv_w^2,
    kish_from_n_deff = n_obs / deff
  )
# A tibble: 1 × 9
  n_obs  sum_w sum_w2 mean_w  sd_w kish_ess  cv_w  deff kish_from_n_deff
  <int>  <dbl>  <dbl>  <dbl> <dbl>    <dbl> <dbl> <dbl>            <dbl>
1 53799 37175. 47006.  0.691 0.630   29400. 0.911  1.83           29399.

A5. Per-country Sample Characteristics

Show the code
# N total per country
df_n_total <- df_gmh |>
  dplyr::count(country, name = "n_total")

# age_group
df_age_group <- df_gmh |>
  dplyr::count(country, age_group, name = "n") |>
  tidyr::pivot_wider(
    names_from = age_group,
    values_from = n,
    values_fill = 0
  ) |>
  dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)

# childhood_SES_cat
df_childhood_ses <- df_gmh |>
  dplyr::filter(!is.na(childhood_SES_cat)) |>
  dplyr::count(country, childhood_SES_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = childhood_SES_cat,
    values_from = n,
    values_fill = 0
  )

# citizenship_cat
df_citizenship <- df_gmh |>
  dplyr::count(country, citizenship_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = citizenship_cat,
    values_from = n,
    values_fill = 0
  )

# education_recoded_cat
df_education <- df_gmh |>
  dplyr::filter(!is.na(education_recoded_cat)) |>
  dplyr::count(country, education_recoded_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = education_recoded_cat,
    values_from = n,
    values_fill = 0
  ) |> 
  dplyr::select(country, `Less than secondary`, 
                Secondary, Technical, University, Advanced)

# employment_primary
df_employment <- df_gmh |>
  dplyr::filter(!is.na(employment_primary)) |>
  dplyr::count(country, employment_primary, name = "n") |>
  tidyr::pivot_wider(
    names_from = employment_primary,
    values_from = n,
    values_fill = 0
  )

# sex_reviewed_cat
df_sex <- df_gmh |>
  dplyr::filter(!is.na(sex_reviewed_cat)) |>
  dplyr::count(country, sex_reviewed_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = sex_reviewed_cat,
    values_from = n,
    values_fill = 0
  )

# household size
df_household_size <- df_gmh |>
  dplyr::count(country, household_size_group, name = "n") |>
  tidyr::pivot_wider(
    names_from = household_size_group,
    values_from = n,
    values_fill = 0
  )

# financial outlook
df_fin_outlook <- df_gmh |>
  dplyr::filter(!is.na(fin_outlook_cat)) |>
  dplyr::count(country, fin_outlook_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = fin_outlook_cat,
    values_from = n,
    values_fill = 0
  )

# work_arrangement_cat
df_work_arrangement <- df_gmh |>
  dplyr::filter(!is.na(work_arrangement_cat)) |>
  dplyr::count(country, work_arrangement_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = work_arrangement_cat,
    values_from = n,
    values_fill = 0
  )

# sponsorship
df_sponsorship <- df_gmh |>
  dplyr::mutate(
    sponsorship = case_when(
      sponsored == 1 ~ "Sponsored",
      sponsored == 0 ~ "Non-sponsored",
      TRUE ~ NA_character_
    )
 ) |>
  dplyr::count(country, sponsorship, name = "n") |>
  tidyr::pivot_wider(
    names_from = sponsorship,
    values_from = n,
    values_fill = 0
  )

# income_merg_cat
# remove rows with "Student non-working (Full or part-time)" in employment_primary
df_income <- df_gmh |>
  dplyr::filter(
    employment_primary != "Student non-working (Full or part-time)" &
    !is.na(income_merg_cat)
  ) |>
  dplyr::count(country, income_merg_cat, name = "n") |>
  tidyr::pivot_wider(
    names_from = income_merg_cat,
    values_from = n,
    values_fill = 0
  ) |>
  dplyr::select(country, `No income`, 
                `Second decile`, `Third decile`, 
                `Fourth decile`, `Fifth decile`, `Sixth decile`,
                `Seventh decile`, `Eighth decile`,
                `Ninth decile`, `Tenth decile`)

# join everything by country
df_demo_counts <- df_n_total |>
  dplyr::full_join(df_age_group, by = "country") |>
  dplyr::full_join(df_childhood_ses, by = "country") |>
  dplyr::full_join(df_citizenship, by = "country") |>
  dplyr::full_join(df_education, by = "country") |>
  dplyr::full_join(df_employment, by = "country") |>
  dplyr::full_join(df_sex, by = "country") |>
  dplyr::full_join(df_household_size, by = "country") |>
  dplyr::full_join(df_fin_outlook, by = "country") |>
  dplyr::full_join(df_work_arrangement, by = "country") |>
  dplyr::full_join(df_income, by = "country") |>
  dplyr::full_join(df_sponsorship, by = "country")

write_csv(df_demo_counts, "222_countries_demographics_raw.csv")

reactable::reactable(
    df_demo_counts,
    pagination = FALSE,
    height = 650,
    width = 800,
    defaultSorted = "country",
    defaultSortOrder = "asc",
    searchable = TRUE,
    striped = TRUE,
    compact = TRUE,
    highlight = TRUE,
    
    columnGroups = list(
    colGroup(
      name = "Age group", 
      columns = colnames(df_demo_counts)[3:7]),
    colGroup(
      name = "Childhood SES", 
      columns = colnames(df_demo_counts)[8:12]),
    colGroup(
      name = "Citizenship status", 
      columns = colnames(df_demo_counts)[13:17]),
    colGroup(
      name = "Education level", 
      columns = colnames(df_demo_counts)[18:22]),
    colGroup(
      name = "Employmnet status", 
      columns = colnames(df_demo_counts)[23:29]),
    colGroup(
      name = "Sex", 
      columns = colnames(df_demo_counts)[30:32]),
    colGroup(
      name = "Household size", 
      columns = colnames(df_demo_counts)[33:37]),
    colGroup(
      name = "Financial outlook", 
      columns = colnames(df_demo_counts)[38:42]),
    colGroup(
      name = "Work arrangement", 
      columns = colnames(df_demo_counts)[43:47]),
    colGroup(
      name = "Income deciles (student non-working excluded)", 
      columns = colnames(df_demo_counts)[48:57]),
    colGroup(
      name = "Sponsorship status",
      columns = colnames(df_demo_counts)[58:59])
      ),
    defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
    defaultColDef = reactable::colDef(
      filterable = FALSE,
      vAlign = "center",
      headerVAlign = "bottom",
      class = "cell",
      headerClass = "header",
      headerStyle = list(fontSize = "13px"),
      style = list(fontSize = "13px")),
    columns = list(
      country = reactable::colDef(
        name = "Country",
        sticky = "left",
        width = 100
      ),
      n_total = reactable::colDef(
      name = "<em>N</em><sub>total</sub>",
      html = TRUE,
      width = 50
      )
    )
  )
Show the code
# Cleanup
rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses,
   df_citizenship, df_education, df_employment, df_income, df_sex, df_sponsorship,
   df_work_arrangement, df_household_size, df_fin_outlook)
Show the code
# N total per country
df_n_total <- df_gmh |>
  dplyr::count(country, name = "n_total")

# age_group
df_age_group <- df_gmh |>
  dplyr::count(country, age_group, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = age_group,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)

# childhood_SES_cat
df_childhood_ses <- df_gmh |>
  dplyr::count(country, childhood_SES_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = childhood_SES_cat,
    values_from = pct,
    values_fill = 0
  )

# citizenship_cat
df_citizenship <- df_gmh |>
  dplyr::count(country, citizenship_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = citizenship_cat,
    values_from = pct,
    values_fill = 0
  )

# education_recoded_cat
df_education <- df_gmh |>
  dplyr::filter(!is.na(education_recoded_cat)) |>
  dplyr::count(country, education_recoded_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = education_recoded_cat,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `Less than secondary`,
    Secondary,
    Technical,
    University,
    Advanced
  )

# employment_primary
df_employment <- df_gmh |>
  dplyr::filter(!is.na(employment_primary)) |>
  dplyr::count(country, employment_primary, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = employment_primary,
    values_from = pct,
    values_fill = 0
  )

# sex_reviewed_cat
df_sex <- df_gmh |>
  dplyr::filter(!is.na(sex_reviewed_cat)) |>
  dplyr::count(country, sex_reviewed_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = sex_reviewed_cat,
    values_from = pct,
    values_fill = 0
  )

# household_size_group
df_household_size <- df_gmh |>
  dplyr::count(country, household_size_group, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = household_size_group,
    values_from = pct,
    values_fill = 0
  )

# fin_outlook_cat
df_fin_outlook <- df_gmh |>
  dplyr::filter(!is.na(fin_outlook_cat)) |>
  dplyr::count(country, fin_outlook_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = fin_outlook_cat,
    values_from = pct,
    values_fill = 0
  )

# work_arrangement_cat
df_work_arrangement <- df_gmh |>
  dplyr::filter(!is.na(work_arrangement_cat)) |>
  dplyr::count(country, work_arrangement_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = work_arrangement_cat,
    values_from = pct,
    values_fill = 0
  )

# sponsorship
df_sponsorship <- df_gmh |>
  dplyr::mutate(
    sponsorship = dplyr::case_when(
      sponsored == 1 ~ "Sponsored",
      sponsored == 0 ~ "Non-sponsored",
      TRUE ~ NA_character_
    )
  ) |>
  dplyr::filter(!is.na(sponsorship)) |>
  dplyr::count(country, sponsorship, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = sponsorship,
    values_from = pct,
    values_fill = 0
  )

# income_merg_cat (student non-working excluded)
df_income <- df_gmh |>
  dplyr::filter(
    employment_primary != "Student non-working (Full or part-time)" &
      !is.na(income_merg_cat)
  ) |>
  dplyr::count(country, income_merg_cat, name = "n") |>
  dplyr::group_by(country) |>
  dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = income_merg_cat,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `No income`,
    `Second decile`,
    `Third decile`,
    `Fourth decile`,
    `Fifth decile`,
    `Sixth decile`,
    `Seventh decile`,
    `Eighth decile`,
    `Ninth decile`,
    `Tenth decile`
  )

# join everything by country
df_demo_pct <- df_n_total |>
  dplyr::full_join(df_age_group, by = "country") |>
  dplyr::full_join(df_childhood_ses, by = "country") |>
  dplyr::full_join(df_citizenship, by = "country") |>
  dplyr::full_join(df_education, by = "country") |>
  dplyr::full_join(df_employment, by = "country") |>
  dplyr::full_join(df_sex, by = "country") |>
  dplyr::full_join(df_household_size, by = "country") |>
  dplyr::full_join(df_fin_outlook, by = "country") |>
  dplyr::full_join(df_work_arrangement, by = "country") |>
  dplyr::full_join(df_income, by = "country") |>
  dplyr::full_join(df_sponsorship, by = "country")

reactable::reactable(
  df_demo_pct,
  pagination = FALSE,
  height = 650,
  width = 800,
  defaultSorted = "country",
  defaultSortOrder = "asc",
  searchable = TRUE,
  striped = TRUE,
  compact = TRUE,
  highlight = TRUE,
  columnGroups = list(
    colGroup(
      name = "Age group",
      columns = colnames(df_demo_pct)[3:7]
    ),
    colGroup(
      name = "Childhood SES",
      columns = colnames(df_demo_pct)[8:12]
    ),
    colGroup(
      name = "Citizenship status",
      columns = colnames(df_demo_pct)[13:17]
    ),
    colGroup(
      name = "Education level",
      columns = colnames(df_demo_pct)[18:22]
    ),
    colGroup(
      name = "Employmnet status",
      columns = colnames(df_demo_pct)[23:29]
    ),
    colGroup(
      name = "Sex",
      columns = colnames(df_demo_pct)[30:32]
    ),
    colGroup(
      name = "Household size",
      columns = colnames(df_demo_pct)[33:37]
    ),
    colGroup(
      name = "Financial outlook",
      columns = colnames(df_demo_pct)[38:42]
    ),
    colGroup(
      name = "Work arrangement",
      columns = colnames(df_demo_pct)[43:47]
    ),
    colGroup(
      name = "Income deciles (student non-working excluded)",
      columns = colnames(df_demo_pct)[48:57]
    ),
    colGroup(
      name = "Sponsorship status",
      columns = colnames(df_demo_pct)[58:59]
    )
  ),
  defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
  defaultColDef = reactable::colDef(
    filterable = FALSE,
    vAlign = "center",
    headerVAlign = "bottom",
    class = "cell",
    headerClass = "header",
    headerStyle = list(fontSize = "13px"),
      style = list(fontSize = "13px")),
    columns = list(
      country = reactable::colDef(
        name = "Country",
        sticky = "left",
        width = 100
      ),
      n_total = reactable::colDef(
      name = "<em>N</em><sub>total</sub>",
      html = TRUE,
      width = 50
      )
    )
  )
Show the code
# Country with the most and least female participants
df_demo_pct |>
  dplyr::slice_max(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 Estonia   90.3
Show the code
df_demo_pct |>
  dplyr::slice_min(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 Chad      22.6
Show the code
rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses,
   df_citizenship, df_education, df_employment, df_income, df_sex, df_sponsorship,
   df_work_arrangement, df_household_size, df_fin_outlook)
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Show the code
country_avg <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    mean_age = base::round(base::mean(age, na.rm = TRUE), 1),
    sd_age = base::round(stats::sd(age, na.rm = TRUE), 1),
    mean_mpwb_sum = base::round(base::mean(mpwb_sum, na.rm = TRUE), 1),
    sd_mpwb_sum = base::round(stats::sd(mpwb_sum, na.rm = TRUE), 1),
    mean_phq4_sum = base::round(base::mean(phq4_sum, na.rm = TRUE), 1),
    sd_phq4_sum = base::round(stats::sd(phq4_sum, na.rm = TRUE), 1),
    mean_ls = base::round(base::mean(life_satisfaction, na.rm = TRUE), 1),
    sd_ls = base::round(stats::sd(life_satisfaction, na.rm = TRUE), 1),
    mean_assets_USD = base::round(base::mean(assets_USD, na.rm = TRUE), 1),
    sd_assets_USD = base::round(stats::sd(assets_USD, na.rm = TRUE), 1),
    mean_debts_USD = base::round(base::mean(debts_USD, na.rm = TRUE), 1),
    sd_debts_USD = base::round(stats::sd(debts_USD, na.rm = TRUE), 1),
    mean_income_USD = 
      base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 1),
    sd_income_USD = 
      base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 1),
  )

reactable::reactable(
  country_avg,
  pagination = FALSE,
  searchable = TRUE,
  striped = TRUE,
  highlight = TRUE,
  compact = TRUE,
  class = "avg_tbl",
  columnGroups = list(
    colGroup(name = "Age", columns = c("mean_age","sd_age")),
    colGroup(name = "MPWB sum", columns = c("mean_mpwb_sum","sd_mpwb_sum")),
    colGroup(name = "PHQ-4 sum", columns = c("mean_phq4_sum","sd_phq4_sum")),
    colGroup(name = "Life satisfaction", columns = c("mean_ls","sd_ls")),
    colGroup(name = "Assets USD", columns = c("mean_assets_USD","sd_assets_USD")),
    colGroup(name = "Debts USD", columns = c("mean_debts_USD","sd_debts_USD")),
    colGroup(name = "Income USD", columns = c("mean_income_USD","sd_income_USD"))
  ),
  defaultColDef = colDef(
    headerVAlign = "bottom",
    vAlign = "center",
    headerStyle = list(fontSize = "12px"),
    style = list(fontSize = "12px"),
    filterable = FALSE
  ),
  columns = list(
    country = colDef(
      name = "Country",
      sticky = "left",
      minWidth = 100
    ),
    mean_age = colDef(name = "<i>M</i>", html = TRUE),
    sd_age = colDef(name = "<i>SD</i>", html = TRUE),
    mean_mpwb_sum = colDef(name = "<i>M</i>", html = TRUE),
    sd_mpwb_sum = colDef(name = "<i>SD</i>", html = TRUE),
    mean_phq4_sum = colDef(name = "<i>M</i>", html = TRUE),
    sd_phq4_sum = colDef(name = "<i>SD</i>", html = TRUE),
    mean_ls = colDef(name = "<i>M</i>", html = TRUE),
    sd_ls = colDef(name = "<i>SD</i>", html = TRUE),
    mean_assets_USD = colDef(name = "<i>M</i>", html = TRUE),
    sd_assets_USD = colDef(name = "<i>SD</i>", html = TRUE),
    mean_debts_USD = colDef(name = "<i>M</i>", html = TRUE),
    sd_debts_USD = colDef(name = "<i>SD</i>", html = TRUE),
    mean_income_USD = colDef(name = "<i>M</i>", html = TRUE),
    sd_income_USD = colDef(name = "<i>SD</i>", html = TRUE)
  )
)
Show the code
rm(country_avg)
Show the code
# N total per country
df_n_total <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n_total = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  )

# age_group
df_age_group <- df_gmh |>
  dplyr::group_by(country, age_group) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = age_group,
    values_from = n,
    values_fill = 0
  ) |>
  dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)

# childhood_SES_cat
df_childhood_ses <- df_gmh |>
  dplyr::group_by(country, childhood_SES_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = childhood_SES_cat,
    values_from = n,
    values_fill = 0
  )

# citizenship_cat
df_citizenship <- df_gmh |>
  dplyr::group_by(country, citizenship_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = citizenship_cat,
    values_from = n,
    values_fill = 0
  )

# education_recoded_cat
df_education <- df_gmh |>
  dplyr::filter(!base::is.na(education_recoded_cat)) |>
  dplyr::group_by(country, education_recoded_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = education_recoded_cat,
    values_from = n,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `Less than secondary`,
    Secondary,
    Technical,
    University,
    Advanced
  )

# employment_primary
df_employment <- df_gmh |>
  dplyr::filter(!base::is.na(employment_primary)) |>
  dplyr::group_by(country, employment_primary) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = employment_primary,
    values_from = n,
    values_fill = 0
  )

# sex_reviewed_cat
df_sex <- df_gmh |>
  dplyr::filter(!base::is.na(sex_reviewed_cat)) |>
  dplyr::group_by(country, sex_reviewed_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = sex_reviewed_cat,
    values_from = n,
    values_fill = 0
  )

# household size
df_household_size <- df_gmh |>
  dplyr::group_by(country, household_size_group) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = household_size_group,
    values_from = n,
    values_fill = 0
  )

# financial outlook
df_fin_outlook <- df_gmh |>
  dplyr::filter(!base::is.na(fin_outlook_cat)) |>
  dplyr::group_by(country, fin_outlook_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = fin_outlook_cat,
    values_from = n,
    values_fill = 0
  )

# work_arrangement_cat
df_work_arrangement <- df_gmh |>
  dplyr::filter(!base::is.na(work_arrangement_cat)) |>
  dplyr::group_by(country, work_arrangement_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = work_arrangement_cat,
    values_from = n,
    values_fill = 0
  )

# income_merg_cat
# remove rows with "Student non-working (Full or part-time)" in employment_primary
df_income <- df_gmh |>
  dplyr::filter(
    employment_primary != "Student non-working (Full or part-time)" &
      !base::is.na(income_merg_cat)
  ) |>
  dplyr::group_by(country, income_merg_cat) |>
  dplyr::summarise(
    n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
  ) |>
  tidyr::pivot_wider(
    names_from = income_merg_cat,
    values_from = n,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `No income`,
    `Second decile`,
    `Third decile`,
    `Fourth decile`,
    `Fifth decile`,
    `Sixth decile`,
    `Seventh decile`,
    `Eighth decile`,
    `Ninth decile`,
    `Tenth decile`
  )

# join everything by country
df_demo_countw <- df_n_total |>
  dplyr::full_join(df_age_group, by = "country") |>
  dplyr::full_join(df_childhood_ses, by = "country") |>
  dplyr::full_join(df_citizenship, by = "country") |>
  dplyr::full_join(df_education, by = "country") |>
  dplyr::full_join(df_employment, by = "country") |>
  dplyr::full_join(df_sex, by = "country") |>
  dplyr::full_join(df_household_size, by = "country") |>
  dplyr::full_join(df_fin_outlook, by = "country") |>
  dplyr::full_join(df_work_arrangement, by = "country") |>
  dplyr::full_join(df_income, by = "country")

reactable::reactable(
  df_demo_countw,
  pagination = FALSE,
  height = 650,
  width = 800,
  defaultSorted = "country",
  defaultSortOrder = "asc",
  searchable = TRUE,
  striped = TRUE,
  compact = TRUE,
  highlight = TRUE,
  columnGroups = list(
    colGroup(
      name = "Age group",
      columns = colnames(df_demo_countw)[3:7]
    ),
    colGroup(
      name = "Childhood SES",
      columns = colnames(df_demo_countw)[8:12]
    ),
    colGroup(
      name = "Citizenship status",
      columns = colnames(df_demo_countw)[13:17]
    ),
    colGroup(
      name = "Education level",
      columns = colnames(df_demo_countw)[18:22]
    ),
    colGroup(
      name = "Employmnet status",
      columns = colnames(df_demo_countw)[23:29]
    ),
    colGroup(
      name = "Sex",
      columns = colnames(df_demo_countw)[30:32]
    ),
    colGroup(
      name = "Household size",
      columns = colnames(df_demo_countw)[33:37]
    ),
    colGroup(
      name = "Financial outlook",
      columns = colnames(df_demo_countw)[38:42]
    ),
    colGroup(
      name = "Work arrangement",
      columns = colnames(df_demo_countw)[43:47]
    ),
    colGroup(
      name = "Income deciles (student non-working excluded)",
      columns = colnames(df_demo_countw)[48:57]
    )
  ),
  defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
  defaultColDef = reactable::colDef(
    filterable = FALSE,
    vAlign = "center",
    headerVAlign = "bottom",
    class = "cell",
    headerClass = "header",
    headerStyle = list(fontSize = "13px"),
      style = list(fontSize = "13px")),
    columns = list(
      country = reactable::colDef(
        name = "Country",
        sticky = "left",
        width = 100
      ),
      n_total = reactable::colDef(
      name = "<em>N</em><sub>total</sub>",
      html = TRUE,
      width = 50
      )
    )
  )
Show the code
# Country with the most and least female participants
df_demo_countw |>
  dplyr::slice_max(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 USA       1883
Show the code
df_demo_countw |>
  dplyr::slice_min(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 Chad         3
Show the code
# Cleanup
rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses,
   df_citizenship, df_education, df_employment, df_income, df_sex, df_sponsorship,
   df_work_arrangement, df_household_size, df_fin_outlook)
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_sponsorship' not found
Show the code
# N total per country
df_n_total <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n_total = base::round(base::sum(ps_weight, na.rm = TRUE), 1)
  )

# age_group
df_age_group <- df_gmh |>
  dplyr::group_by(country, age_group) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = age_group,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)

# childhood_SES_cat (weighted %)
df_childhood_ses <- df_gmh |>
  dplyr::group_by(country, childhood_SES_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = childhood_SES_cat,
    values_from = pct,
    values_fill = 0
  )

# citizenship_cat
df_citizenship <- df_gmh |>
  dplyr::group_by(country, citizenship_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = citizenship_cat,
    values_from = pct,
    values_fill = 0
  )

# education_recoded_cat
df_education <- df_gmh |>
  dplyr::filter(!is.na(education_recoded_cat)) |>
  dplyr::group_by(country, education_recoded_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = education_recoded_cat,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `Less than secondary`,
    Secondary,
    Technical,
    University,
    Advanced
  )

# employment_primary
df_employment <- df_gmh |>
  dplyr::filter(!is.na(employment_primary)) |>
  dplyr::group_by(country, employment_primary) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = employment_primary,
    values_from = pct,
    values_fill = 0
  )

# sex_reviewed_cat
df_sex <- df_gmh |>
  dplyr::group_by(country, sex_reviewed_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = sex_reviewed_cat,
    values_from = pct,
    values_fill = 0
  )

# household_size_group
df_household_size <- df_gmh |>
  dplyr::group_by(country, household_size_group) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = household_size_group,
    values_from = pct,
    values_fill = 0
  )

# fin_outlook_cat
df_fin_outlook <- df_gmh |>
  dplyr::filter(!is.na(fin_outlook_cat)) |>
  dplyr::group_by(country, fin_outlook_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = fin_outlook_cat,
    values_from = pct,
    values_fill = 0
  )

# work_arrangement_cat
df_work_arrangement <- df_gmh |>
  dplyr::filter(!is.na(work_arrangement_cat)) |>
  dplyr::group_by(country, work_arrangement_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = work_arrangement_cat,
    values_from = pct,
    values_fill = 0
  )

# income_merg_cat (student non-working excluded)
df_income <- df_gmh |>
  dplyr::filter(
    employment_primary != "Student non-working (Full or part-time)" &
      !is.na(income_merg_cat)
  ) |>
  dplyr::group_by(country, income_merg_cat) |>
  dplyr::summarise(
    wtd_n = base::sum(ps_weight, na.rm = TRUE)
  ) |>
  dplyr::mutate(
    pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
  ) |>
  dplyr::ungroup() |>
  tidyr::pivot_wider(
    id_cols = country,
    names_from = income_merg_cat,
    values_from = pct,
    values_fill = 0
  ) |>
  dplyr::select(
    country,
    `No income`,
    `Second decile`,
    `Third decile`,
    `Fourth decile`,
    `Fifth decile`,
    `Sixth decile`,
    `Seventh decile`,
    `Eighth decile`,
    `Ninth decile`,
    `Tenth decile`
  )

# join everything by country
df_demo_pctw <- df_n_total |>
  dplyr::full_join(df_age_group, by = "country") |>
  dplyr::full_join(df_childhood_ses, by = "country") |>
  dplyr::full_join(df_citizenship, by = "country") |>
  dplyr::full_join(df_education, by = "country") |>
  dplyr::full_join(df_employment, by = "country") |>
  dplyr::full_join(df_sex, by = "country") |>
  dplyr::full_join(df_household_size, by = "country") |>
  dplyr::full_join(df_fin_outlook, by = "country") |>
  dplyr::full_join(df_work_arrangement, by = "country") |>
  dplyr::full_join(df_income, by = "country")

reactable::reactable(
  df_demo_pctw,
  pagination = FALSE,
  height = 650,
  width = 800,
  defaultSorted = "country",
  defaultSortOrder = "asc",
  searchable = TRUE,
  striped = TRUE,
  compact = TRUE,
  highlight = TRUE,
  columnGroups = list(
    colGroup(
      name = "Age group",
      columns = colnames(df_demo_pctw)[3:7]
    ),
    colGroup(
      name = "Childhood SES",
      columns = colnames(df_demo_pctw)[8:12]
    ),
    colGroup(
      name = "Citizenship status",
      columns = colnames(df_demo_pctw)[13:17]
    ),
    colGroup(
      name = "Education level",
      columns = colnames(df_demo_pctw)[18:22]
    ),
    colGroup(
      name = "Employmnet status",
      columns = colnames(df_demo_pctw)[23:29]
    ),
    colGroup(
      name = "Sex",
      columns = colnames(df_demo_pctw)[30:32]
    ),
    colGroup(
      name = "Household size",
      columns = colnames(df_demo_pctw)[33:37]
    ),
    colGroup(
      name = "Financial outlook",
      columns = colnames(df_demo_pctw)[38:42]
    ),
    colGroup(
      name = "Work arrangement",
      columns = colnames(df_demo_pctw)[43:47]
    ),
    colGroup(
      name = "Income deciles (student non-working excluded)",
      columns = colnames(df_demo_pctw)[48:57]
    )
  ),
  defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
  defaultColDef = reactable::colDef(
    filterable = FALSE,
    vAlign = "center",
    headerVAlign = "bottom",
    class = "cell",
    headerClass = "header",
    headerStyle = list(fontSize = "13px"),
      style = list(fontSize = "13px")),
    columns = list(
      country = reactable::colDef(
        name = "Country",
        sticky = "left",
        width = 100
      ),
      n_total = reactable::colDef(
      name = "<em>N</em><sub>total</sub>",
      html = TRUE,
      width = 50
      )
    )
  )
Show the code
# Country with the most and least female participants
df_demo_pctw |>
  dplyr::slice_max(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 Hungary   88.6
Show the code
df_demo_pctw |>
  dplyr::slice_min(Female, n = 1, with_ties = FALSE) |> 
  dplyr::select(country, Female)
# A tibble: 1 × 2
  country Female
  <chr>    <dbl>
1 Chad      22.6
Show the code
# Cleanup
rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses,
   df_citizenship, df_education, df_employment, df_income, df_sex, df_sponsorship,
   df_work_arrangement, df_household_size, df_fin_outlook)
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_sponsorship' not found

A6. Geographic Distribution

# Sanity check: Any missing lat/long values?
df_gmh |>
  dplyr::filter(is.na(lat) | is.na(long)) |>
  base::nrow()
[1] 0
# Extract geographical data and remove Antarctica for more efficient plotting
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") |>
  base::subset(name != "Antarctica")

# Create data frame with unweighted values
by_cty <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n = dplyr::n(),
    pct_female = base::mean(sex_reviewed_cat == "Female", na.rm = TRUE) * 100,
    pct_fulltime = base::mean(
      employment_primary == "Employed/working full-time (25+ hours per week)",
      na.rm = TRUE) * 100,
    mean_age = base::mean(age, na.rm = TRUE),
    median_dur_min = stats::median(duration_sec, na.rm = TRUE) / 60
  ) |>
  arrange(desc(n))

total_row <- df_gmh |>
  dplyr::summarise(
    country = "Total",
    n = dplyr::n(),
    pct_female = base::mean(sex_reviewed_cat == "Female", na.rm = TRUE) * 100,
    pct_fulltime = base::mean(
      employment_primary == "Employed/working full-time (25+ hours per week)",
      na.rm = TRUE) * 100,
    mean_age = base::mean(age, na.rm = TRUE),
    median_dur_min = stats::median(duration_sec, na.rm = TRUE) / 60
  )

tbl_country <- dplyr::bind_rows(total_row, by_cty) |>
  dplyr::mutate(
    `% Female` = base::sprintf("%.1f%%", pct_female),
    `% Full-time employed` = base::sprintf("%.1f%%", pct_fulltime),
    `italic(M)[age]` = base::round(mean_age, 1),
    `italic(Md)[duration]` = base::round(median_dur_min, 1)
  ) |>
  dplyr::select(
    Country = country,
    `italic(n)` = n,
    `% Female`,
    `% Full-time employed`,
    `italic(M)[age]`,
    `italic(Md)[duration]`
  )

# Table aesthetics
tt <- gridExtra::ttheme_minimal(
  core = list(
    fg_params = list(
      hjust = 0,
      x = 0.02,
      fontsize = 12.5, fontfamily = "Inter",
      col = "#051520"
    ),
    padding = unit(c(7.5, 2.2), "pt")
  ),
  colhead = list(
    fg_params = list(
      hjust = 0,
      x = 0.02,
      fontsize = 12.7,
      fontface = "plain",
      col = "#051520",
      parse = TRUE
    ),
    bg_params = list(fill = NA)
  )
)

# Create table grob
tbl_grob <- 
  gridExtra::tableGrob(tbl_country, 
                       rows = NULL, 
                       theme = tt)

tbl_grob <- gtable::gtable_add_grob(
  tbl_grob,
  grobs = grid::segmentsGrob(
    x0 = grid::unit(0, "npc"),
    x1 = grid::unit(1, "npc"),
    y0 = grid::unit(1, "npc"),
    y1 = grid::unit(1, "npc")
  ),
  t = 1, l = 1, r = ncol(tbl_grob)
)

tbl_grob <- gtable::gtable_add_grob(
  tbl_grob,
  grobs = grid::segmentsGrob(
    x0 = grid::unit(0, "npc"),
    x1 = grid::unit(1, "npc"),
    y0 = grid::unit(0, "npc"),
    y1 = grid::unit(0, "npc")
  ),
  t = nrow(tbl_grob), l = 1, r = ncol(tbl_grob)
)

tbl_grob$widths[[2]] <- tbl_grob$widths[[2]] * 1.9

# Add label "B" to the table block
tables_block <- cowplot::ggdraw(ggplotify::as.ggplot(tbl_grob)) +
  draw_label("B", x = 0.002, y = 0.992, 
             hjust = 0, vjust = 1, fontface = "bold", 
             size = 30, fontfamily = "Inter", color = "#051520")
Warning: `aes_()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`
ℹ The deprecated feature was likely used in the ggplotify package.
  Please report the issue at <https://github.com/GuangchuangYu/ggplotify/issues>.
# Plot aesthetics
map_theme <- ggplot2::theme_minimal(base_family = "Inter", base_size = 20) +
  ggplot2::theme(
    text = element_text(family = "Inter", colour = "#051520"),
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background  = element_rect(fill = "transparent", colour = NA))

p_world <- ggplot2::ggplot() +
  ggplot2::geom_sf(data = world, linewidth = 0.2, fill = "#ebedf0", color = "#a3a9bd") +
  ggplot2::geom_point(data = df_gmh, aes(long, lat), alpha = 0.45, 
                      size = 0.3, color="#082444") + 
  ggplot2::coord_sf(expand = FALSE) +
  ggplot2::labs(title = "A\n") +
  map_theme +
  ggplot2::theme(plot.title = element_text(hjust = 0.02, face = "bold", size = 30,
                                           colour = "#051520", family = "Inter"))

# Zoom Southeast Asia
sea_xlim <- c(90, 135)
sea_ylim <- c(-15, 25)

# Zoom Europe
eu_xlim  <- c(-25, 45)
eu_ylim  <- c(34, 72)

# Zoom East Asia
ea_xlim  <- c(110, 150)
ea_ylim  <- c(20, 50)

zoom_plot <- function(xlim, ylim, title_txt) {
  ggplot2::ggplot() +
    ggplot2::geom_sf(data = world, linewidth = 0.2, 
                     fill = "#ebedf0", color = "#a3a9bd") +
    ggplot2::geom_point(
      data = df_gmh |> dplyr::filter(long >= xlim[1], long <= xlim[2],
                                     lat >= ylim[1], lat <= ylim[2]),
      aes(long, lat), alpha = 0.55, size = 0.3, color="#082444"
    ) +
    ggplot2::coord_sf(xlim = xlim, ylim = ylim, expand = FALSE) +
    ggplot2::labs(title = title_txt) +
    map_theme +
    ggplot2::theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 22,
                                colour = "#051520", family = "Inter"),
      plot.margin = margin(2, 2, 2, 2))
}

p_sea <- zoom_plot(sea_xlim, sea_ylim, "Southeast Asia")
p_eu  <- zoom_plot(eu_xlim,  eu_ylim,  "Europe")
p_ea  <- zoom_plot(ea_xlim,  ea_ylim,  "East Asia")

zooms_row <- cowplot::plot_grid(p_eu, p_sea, p_ea, nrow = 1, rel_widths = c(1, 1, 1))
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
left_col  <- cowplot::plot_grid(p_world, zooms_row, ncol = 1, rel_heights = c(2.2, 1.2))
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
# Final arrangement
p_fig1 <- cowplot::plot_grid(left_col, tables_block, 
                                   nrow = 1, rel_widths = c(2.9, 1.1))

ggplot2::ggsave("555_figure1_map.png",
       plot = p_fig1, width = 3839, height = 2060, units = "px",
       dpi = 300, bg = "transparent")
p_fig1

df_int_map <- df_gmh |>
  dplyr::filter(!is.na(long), !is.na(lat)) |>
  dplyr::mutate(
    label_txt = sprintf("Location: %s", loc_admin_1))

map_int <-
  leaflet::leaflet(
    df_int_map,
    options = leaflet::leafletOptions(
      zoomControl = TRUE,
      preferCanvas = TRUE
    )
  ) |>
  leaflet::addProviderTiles("CartoDB.Positron") |>
  leaflet::addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    radius = 3,
    stroke = FALSE,
    fillOpacity = 0.5,
    fillColor = "#082444",
    color = "#082444",
    label = ~label_txt,
    labelOptions = leaflet::labelOptions(
      noHide = FALSE,
      direction = "auto",
      sticky = TRUE,
      opacity = 0.9,
      style = list("font-size" = "12px")
    )
  ) |>
  leaflet.extras::addSearchOSM(
    options = leaflet.extras::searchOptions(initial = FALSE)
  ) |>
  leaflet.extras::addResetMapButton()

map_int

A7. Comparison of Sponsored vs. Non-sponsored participants

# Sanity check: View the counts of sponsored participants per country
df_gmh |>
  dplyr::filter(sponsored == 1) |>
  dplyr::group_by(country) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity check: Compare mean values of mpwb by sponsored status
df_gmh |>
  dplyr::group_by(country) |>
  dplyr::mutate(
    n_sponsored = base::sum(sponsored == 1, na.rm = TRUE)
  ) |>
  # Only include countries with at least 30 sponsored participants
  dplyr::filter(n_sponsored >= 30) |>
  dplyr::group_by(country, sponsored) |>
  dplyr::summarise(
    n = dplyr::n(),
    mean_mpwb_sum = base::mean(mpwb_sum, na.rm = TRUE), 
    .groups = "drop"
  ) |>
  tidyr::pivot_wider(
    names_from = sponsored,
    values_from = c(n, mean_mpwb_sum),
    names_prefix = "sponsored_"
  ) |>
  dplyr::mutate(
    mean_diff_abs = abs(mean_mpwb_sum_sponsored_1 - mean_mpwb_sum_sponsored_0)
  ) |>
  dplyr::arrange(-mean_diff_abs) |>
  print_reactable(sorted_col = "mean_diff_abs", width = 800)
# Fit model with random intercepts and slopes for country
df_filtered <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::mutate(
    n_sponsored = base::sum(sponsored == 1, na.rm = TRUE)
  ) |>
  # Only include countries with at least 30 sponsored participants
  dplyr::filter(n_sponsored >= 30)

model_sponsored <- lme4::lmer(
  mpwb_sum ~ sponsored + (1 + sponsored | country),
  data = df_filtered
)

summary(model_sponsored)
Linear mixed model fit by REML ['lmerMod']
Formula: mpwb_sum ~ sponsored + (1 + sponsored | country)
   Data: df_filtered

REML criterion at convergence: 110450

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.8922 -0.6108  0.0377  0.6602  2.5964 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 country  (Intercept)   7.80    2.793        
          sponsored    10.57    3.251   -0.51
 Residual             112.10   10.588        
Number of obs: 14602, groups:  country, 18

Fixed effects:
            Estimate Std. Error t value
(Intercept)  47.4337     0.6890  68.846
sponsored     1.2853     0.8154   1.576

Correlation of Fixed Effects:
          (Intr)
sponsored -0.530
car::Anova(model_sponsored, type = "2")
Analysis of Deviance Table (Type II Wald chisquare tests)

Response: mpwb_sum
           Chisq Df Pr(>Chisq)
sponsored 2.4848  1     0.1149

A8. Household size distribution

Distribution of household size groups

# Estimate weighted distribution of household size groups
df_hhsize <- df_gmh |>
  survey::svydesign(
    ids = ~1,
    weights = ~ps_weight,
    data = _,
    nest = TRUE
  )

# survey package allows to estimate CI more easily
est <- survey::svymean(~factor(household_size_group), df_hhsize, na.rm = TRUE)
ci <- stats::confint(est)

tibble::tibble(
    household_size_group = sub("^factor\\(household_size_group\\)", "", names(est)),
    percentage = round(100 * as.numeric(est),1),
    ci_l = 100 * ci[, 1],
    ci_u = 100 * ci[, 2]
  ) |>
  dplyr::mutate(
    household_size_group = trimws(household_size_group)
  ) |>
  dplyr::arrange(household_size_group)
# A tibble: 5 × 4
  household_size_group percentage  ci_l  ci_u
  <chr>                     <dbl> <dbl> <dbl>
1 1                          27.2 26.7  27.7 
2 2                          26.6 26.1  27.1 
3 3                          16.1 15.7  16.5 
4 4-5                        22.8 22.4  23.3 
5 6-20                        7.2  6.97  7.52

Distribution of single households (living alone)

df_alone <- df_gmh |>
  dplyr::mutate(
    is_alone = base::as.integer(household_size == 1)
  )

# Sanity check
table(df_alone$is_alone, useNA = "ifany")

    0     1 
39953 13846 
df_alone_w <-
  survey::svydesign(
    ids = ~1,
    weights = ~ps_weight,
    data = df_alone,
    nest = TRUE
  )

overall_w <- function(des) {
  est <- survey::svyciprop(~I(is_alone == 1), des, method = "logit", na.rm = TRUE)
  ci <- stats::confint(est)
  tibble::tibble(
    pct_living_alone = round(100 * as.numeric(est), 1),
    ci_l = round(100 * ci[1], 1),
    ci_u = round(100 * ci[2], 1)
  )
}

by_age_w <- function(des) {
  survey::svyby(
    ~I(is_alone == 1),
    ~age_group,
    des,
    survey::svyciprop,
    method = "logit",
    vartype = c("ci"),
    na.rm = TRUE,
    keep.names = FALSE
  ) |>
  dplyr::transmute(
    age_group = age_group,
    pct_living_alone = round(100 * `I(is_alone == 1)`, 1),
    ci_l = round(100 * ci_l, 1),
    ci_u = round(100 * ci_u, 1)
  )
}

# Overall 
global_overall <- df_alone_w |>
  subset(!is.na(is_alone)) |>
  overall_w() |>
  dplyr::mutate(region = "Global")

eu_overall <- df_alone_w |>
  subset(country %in% eu_countries) |>
  overall_w() |>
  dplyr::mutate(region = "EU")

uk_overall <- df_alone_w |>
  subset(country == "UK") |>
  overall_w() |>
  dplyr::mutate(region = "UK")

usa_overall <- df_alone_w |>
  subset(country == "USA") |>
  overall_w() |>
  dplyr::mutate(region = "USA")

dplyr::bind_rows(global_overall, eu_overall, uk_overall, usa_overall) |>
  dplyr::relocate(region, pct_living_alone, ci_l, ci_u) |>
  dplyr::arrange(region)
# A tibble: 4 × 4
  region pct_living_alone  ci_l  ci_u
  <chr>             <dbl> <dbl> <dbl>
1 EU                 34.7  33.6  35.8
2 Global             27.2  26.7  27.7
3 UK                 31    26.3  36.1
4 USA                32.4  30.7  34  
# By age
global_age <- df_alone_w |>
  by_age_w() |>
  dplyr::mutate(region = "Global")

eu_age <- df_alone_w |>
  subset(country %in% eu_countries) |>
  by_age_w() |>
  dplyr::mutate(region = "EU")

uk_age <- df_alone_w |>
  subset(country == "UK") |>
  by_age_w() |>
  dplyr::mutate(region = "UK")

usa_age <- df_alone_w |>
  subset(country == "USA") |>
  by_age_w() |>
  dplyr::mutate(region = "USA")

living_alone_by_age <-
  dplyr::bind_rows(global_age, eu_age, uk_age, usa_age) |>
  dplyr::relocate(region, age_group, pct_living_alone, ci_l, ci_u) |>
  dplyr::arrange(region, age_group)

living_alone_by_age |>
  dplyr::select(region, age_group, pct_living_alone) |>
  tidyr::pivot_wider(names_from = age_group, values_from = pct_living_alone) |>
  dplyr::arrange(region)
# A tibble: 4 × 6
  region `18-25` `26-44` `45-64` `65-74` `75+`
  <chr>    <dbl>   <dbl>   <dbl>   <dbl> <dbl>
1 EU        38.4    37      28      36.7  51.8
2 Global    28.8    27.3    23.2    33.2  44.4
3 UK        32.3    36.5    22.6    35.3  30.3
4 USA       44.9    32.9    26      26.6  34.3

A10. Order effects on MPWB items

Assessment if randomizing the order of MPWB items removed any confound effect of order. For this, we ignored the specific items, and computed means for the first presented item, second presented item, …, up to the tenth presented item (since there are 10 items in total).

# Identify items in CoreMPWB_DO
map_q_to_col <- c(
  "Q5" = "mpwb_competence",
  "Q7" = "mpwb_emotional_stability",
  "Q9" = "mpwb_engagement",
  "Q11" = "mpwb_meaning",
  "Q13" = "mpwb_optimism",
  "Q15" = "mpwb_positive_emotion",
  "Q17" = "mpwb_positive_relationships",
  "Q19" = "mpwb_resilience",
  "Q21" = "mpwb_self_esteem",
  "Q23" = "mpwb_vitality"
)

# determine presentation order from CoreMPWB_DO into long rows with position
order_long <- df_gmh |>
  # We only have order data for non-IRL sponsored participants
  dplyr::filter(irl == 0) |>
  dplyr::select(ResponseId, CoreMPWB_DO) |>
  dplyr::mutate(q_tokens = stringr::str_split(CoreMPWB_DO, "\\|")) |>
  tidyr::unnest(q_tokens) |>
  dplyr::mutate(q_tokens = stringr::str_trim(q_tokens)) |>
  dplyr::filter(q_tokens %in% names(map_q_to_col)) |>
  dplyr::mutate(mpwb_col = unname(map_q_to_col[q_tokens])) |>
  dplyr::group_by(ResponseId) |>
  dplyr::mutate(order_pos = dplyr::row_number()) |>
  dplyr::select(ResponseId, mpwb_col, order_pos)

resp_long <- df_gmh |>
  dplyr::select(ResponseId, dplyr::all_of(mpwb_items)) |>
  tidyr::pivot_longer(
    cols = dplyr::all_of(mpwb_items),
    names_to = "mpwb_col",
    values_to = "response"
  )

by_order <- resp_long |>
  dplyr::inner_join(order_long, by = c("ResponseId", "mpwb_col"))

# summary by presentation position
order_summary <- by_order |>
  dplyr::group_by(order_pos) |>
  dplyr::summarise(
    mean = base::mean(response, na.rm = TRUE),
    sd = stats::sd(response, na.rm = TRUE),
    n = base::sum(response),
    se = sd / sqrt(pmax(n, 1)),
    lo = mean - 1.96 * se,
    hi = mean + 1.96 * se
  ) |>
  dplyr::right_join(
    tibble::tibble(order_pos = 1:length(map_q_to_col)), by = "order_pos") |>
  dplyr::arrange(order_pos)

# linear model of response by position
df_order <- by_order |>
  dplyr::select(response, position = order_pos)

model_order <- stats::lm(response ~ position, data = df_order)
summary(model_order)

Call:
stats::lm(formula = response ~ position, data = df_order)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7659 -0.7509  0.2541  1.2491  2.2792 

Coefficients:
              Estimate Std. Error  t value             Pr(>|t|)    
(Intercept)  4.7709285  0.0044504 1072.021 < 0.0000000000000002 ***
position    -0.0050152  0.0007172   -6.992     0.00000000000271 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.494 on 525988 degrees of freedom
Multiple R-squared:  9.294e-05, Adjusted R-squared:  9.104e-05 
F-statistic: 48.89 on 1 and 525988 DF,  p-value: 0.000000000002708
# Effect size (eta squared)
format(round(lsr::etaSquared(model_order, type = 2), 4), nsmall = 4)
         eta.sq   eta.sq.part
position "0.0001" "0.0001"   
# plot presentation-order effect
ggplot2::ggplot(order_summary, ggplot2::aes(x = order_pos, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.5, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, na.rm = TRUE, colour = "#11357f") +
  ggplot2::geom_point(size = 1.4, na.rm = TRUE, colour = "#11357f") +
  ggplot2::scale_x_continuous(breaks = 1:length(map_q_to_col)) +
  ggplot2::scale_y_continuous(breaks = 4:7, limits = c(4, 7)) +
  ggplot2::labs(x = "Presentation Order", y = "MPWB average") +
  theme_gmh +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25))

ggplot2::ggplot(order_summary, ggplot2::aes(x = order_pos, y = mean)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.5, fill = "#abc7ff") +
  ggplot2::geom_line(linewidth = 0.7, na.rm = TRUE, colour = "#11357f") +
  ggplot2::geom_point(size = 1.4, na.rm = TRUE, colour = "#11357f") +
  ggplot2::scale_x_continuous(breaks = 1:length(map_q_to_col)) +
  ggplot2::labs(x = "Presentation Order", y = "MPWB average") +
  theme_gmh +
  ggplot2::theme(
    panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25))

A11. Relationship between % Female participants and MPWB sum by country

# Compute unweighted % female and mean MPWB sum per country
country_fem <- df_gmh |>
  dplyr::filter(sex_reviewed_cat %in% c("Male", "Female")) |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n_total = dplyr::n(),
    n_female = base::sum(sex_reviewed_cat == "Female"),
    pct_female = (n_female / n_total) * 100,
    mean_mpwb = base::mean(mpwb_sum, na.rm = TRUE)
  )

# Correlation
cor.test(country_fem$pct_female, country_fem$mean_mpwb, method = "pearson")

    Pearson's product-moment correlation

data:  country_fem$pct_female and country_fem$mean_mpwb
t = -0.88104, df = 90, p-value = 0.3806
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2917633  0.1145144
sample estimates:
        cor 
-0.09247186 
# Linear model
model_fem <- lm(mean_mpwb ~ pct_female, data = country_fem)
summary(model_fem)

Call:
lm(formula = mean_mpwb ~ pct_female, data = country_fem)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.1881 -2.0496 -0.0373  2.0868  8.8472 

Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept) 49.08313    1.55941  31.475 <0.0000000000000002 ***
pct_female  -0.02219    0.02519  -0.881               0.381    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.225 on 90 degrees of freedom
Multiple R-squared:  0.008551,  Adjusted R-squared:  -0.002465 
F-statistic: 0.7762 on 1 and 90 DF,  p-value: 0.3806
# Plot
country_flags <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    iso2 = tolower(dplyr::coalesce(dplyr::first(na.omit(iso2)), NA_character_)),
    alpha_country = 
      dplyr::if_else(any(ps_weight_flag == 1, na.rm = TRUE), 0.5, 1, missing = 1)
  )

country_fem <- country_fem |>
  dplyr::left_join(country_flags, by = "country")

plot_fem <- ggplot2::ggplot(country_fem, ggplot2::aes(x = pct_female, y = mean_mpwb)) +
  ggplot2::geom_point(ggplot2::aes(alpha = alpha_country),
                      shape = 21, colour = "#051520", size = 4) +
  ggfx::with_shadow(
    ggplot2::geom_point(ggplot2::aes(alpha = alpha_country), size = 4.2, alpha = 0.5, stroke = 0),
    sigma = 2, colour = "gray60", x_offset = 1, y_offset = 1
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2), size = 3.5, na.rm = TRUE) +
  ggplot2::labs(
    subtitle = paste("Pearson's r =", round(cor_test$estimate, 3)),
    x = "% Female participants",
    y = "MPWB Sum"
  ) +
  theme_gmh +
  ggplot2::guides(alpha = "none") +
  ggplot2::coord_flip()
Error: object 'cor_test' not found
plot_fem
Error: object 'plot_fem' not found

A12. Global rankings on MPWB and Life Satisfaction

Figure 2

# Basic definitions for the plot
lane_width <- 40
center_step <- 2 * lane_width
y_min_sum <- 29
y_max_sum <- 61
dy <- 0.5
overlay_alpha_on <- 0.3

# Define the labels for x-axis
mpwb_names <- c(
  mpwb_positive_relationships = "Positive\nrelationships",
  mpwb_meaning = "Meaning",
  mpwb_competence = "Competence",
  mpwb_engagement = "Engagement",
  mpwb_self_esteem = "Self-esteem",
  mpwb_positive_emotion = "Positive\nemotions",
  mpwb_optimism = "Optimism",
  mpwb_resilience = "Resilience",
  mpwb_emotional_stability = "Emotional\nstability",
  mpwb_vitality = "Vitality",
  MPWB_sum = "MPWB sum\n",
  Life_satisfaction = "Life\nsatisfaction"
)

# gradient palette on 10–70 reference (mpwb_sum)
pal10_70 <- scales::gradient_n_pal(
  colours = c("#e74c3c", "white", "#2ecc71"),
  values = scales::rescale(c(10, 40, 70), to = c(0, 1), from = c(10, 70))
)

# Build per-plot gradient rectangles on native y scale, colored by equivalent 10–70
make_bg <- function(lane_df,
                    y_min_native,
                    y_max_native,
                    dy_native,
                    native_to_10_70) {
  y_breaks <- seq(y_min_native, y_max_native - dy_native, by = dy_native)
  bg <- tidyr::expand_grid(x_id = lane_df$x_id, y0 = y_breaks) |>
    dplyr::left_join(lane_df, by = "x_id") |>
    dplyr::mutate(
      xmin = x_pos - lane_width,
      xmax = x_pos + lane_width,
      ymin = y0,
      ymax = y0 + dy_native,
      y_mid = (ymin + ymax) / 2,
      y10_70 = native_to_10_70(y_mid),
      fill = pal10_70(scales::rescale(
        y10_70, to = c(0, 1), from = c(10, 70)
      ))
    )
  bg
}

make_lane_geometry <- function(lane_df, y_min, y_max) {
  list(
    overlay = lane_df |>
      dplyr::mutate(
        xmin = x_pos - lane_width,
        xmax = x_pos + lane_width,
        ymin = y_min,
        ymax = y_max,
        overlay_alpha = if_else(x_id %% 2 == 1, overlay_alpha_on, 0)
      ),
    edges = lane_df |>
      dplyr::transmute(
        x = x_pos + lane_width,
        xend = x_pos + lane_width,
        y = y_min,
        yend = y_max
      )
  )
}

# Global weighted means for ordering and mean lines

# Item global weighted means
item_wmeans <- tibble::tibble(item = mpwb_items) |>
  dplyr::mutate(
    wmean = purrr::map_dbl(item, ~ Hmisc::wtd.mean(df_gmh[[.x]], weights = df_gmh$ps_weight, na.rm = TRUE))
  ) |>
  dplyr::arrange(dplyr::desc(wmean))

items_ordered <- item_wmeans$item
nice_items_ordered <- unname(mpwb_names[items_ordered])

# Global means (for vertical lines)
mean_sum_global <- as.numeric(Hmisc::wtd.mean(df_gmh$mpwb_sum, weights = df_gmh$ps_weight, na.rm = TRUE))
mean_ls_global <- as.numeric(Hmisc::wtd.mean(df_gmh$life_satisfaction, weights = df_gmh$ps_weight, na.rm = TRUE))

means_items_global <- item_wmeans |>
  dplyr::rename(measure = item, mean_native = wmean)

# EU-level means
df_eu <- df_gmh |>
  dplyr::filter(country %in% eu_countries)

eu_items <- tibble::tibble(measure = mpwb_items) |>
  dplyr::mutate(mean_val = purrr::map_dbl(measure, ~ Hmisc::wtd.mean(df_eu[[.x]], weights = df_eu$ps_weight, na.rm = TRUE))) |>
  dplyr::mutate(country = "EU", iso2 = "eu", flagged = FALSE)

eu_sum <- tibble::tibble(
  country = "EU",
  MPWB_sum = as.numeric(Hmisc::wtd.mean(df_eu$mpwb_sum, weights = df_eu$ps_weight, na.rm = TRUE)),
  iso2 = "eu",
  flagged = FALSE
)

eu_ls <- tibble::tibble(
  country = "EU",
  Life_satisfaction = as.numeric(Hmisc::wtd.mean(df_eu$life_satisfaction, weights = df_eu$ps_weight, na.rm = TRUE)),
  iso2 = "eu",
  flagged = FALSE
)

# Flagged countries during weighting (see Section A0.4)


# Country-level weighted means for items
means_cty_items <- df_gmh |>
  dplyr::group_by(country, iso2) |>
  dplyr::mutate(iso2 = tolower(iso2)) |>
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::all_of(mpwb_items),
      .fns = ~ Hmisc::wtd.mean(.x, weights = ps_weight, na.rm = TRUE),
      .names = "{.col}"
    )
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::all_of(mpwb_items),
    names_to = "measure",
    values_to = "mean_val"
  ) |>
  dplyr::mutate(flagged = country %in% flagged_countries) |>
  dplyr::bind_rows(eu_items)

# Country-level MPWB sum
means_cty_sum <- df_gmh |>
  dplyr::group_by(country, iso2) |>
  dplyr::summarise(
    MPWB_sum = as.numeric(Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE))
  ) |>
  dplyr::mutate(flagged = country %in% flagged_countries) |>
  dplyr::bind_rows(eu_sum)

# Country-level Life satisfaction
means_cty_ls <- df_gmh |>
  dplyr::group_by(country, iso2) |>
  dplyr::summarise(
    Life_satisfaction = as.numeric(Hmisc::wtd.mean(life_satisfaction, weights = ps_weight, na.rm = TRUE))
  ) |>
  dplyr::mutate(flagged = country %in% flagged_countries) |>
  dplyr::bind_rows(eu_ls)

# Lane positions
lane_items <- tibble::tibble(
  x_id = seq_along(items_ordered),
  label = unname(nice_items_ordered),
  x_pos = (seq_along(items_ordered) - 1) * center_step
)
lane_sum <- tibble::tibble(x_id = 1L,
                           label = unname(mpwb_names["MPWB_sum"]),
                           x_pos = 0)
lane_ls <- tibble::tibble(x_id = 1L,
                          label = unname(mpwb_names["Life_satisfaction"]),
                          x_pos = 0)

# Vertical alignment
y_min_items <- 1 + (y_min_sum - 10) / 10
y_max_items <- 1 + (y_max_sum - 10) / 10
y_min_ls <- (y_min_sum - 10) / 6
y_max_ls <- (y_max_sum - 10) / 6

# background coloring
to10_70_items <- function(y)
  10 + (y - 1) * 10
to10_70_sum <- function(y)
  y
to10_70_ls <- function(y)
  10 + y * 6

bg_items <- make_bg(
  lane_items,
  y_min_items,
  y_max_items,
  dy_native = dy / 10,
  native_to_10_70 = to10_70_items
)
bg_sum <- make_bg(lane_sum,
                    y_min_sum,
                    y_max_sum,
                    dy_native = dy,
                    native_to_10_70 = to10_70_sum)
bg_ls <- make_bg(lane_ls,
                    y_min_ls,
                    y_max_ls,
                    dy_native = dy / 6,
                    native_to_10_70 = to10_70_ls)

geo_items <- make_lane_geometry(lane_items, y_min_items, y_max_items)
geo_sum <- make_lane_geometry(lane_sum, y_min_sum, y_max_sum)
geo_ls <- make_lane_geometry(lane_ls, y_min_ls, y_max_ls)

# Jittered points
set.seed(123)
points_items <- means_cty_items |>
  dplyr::mutate(
    x_id = match(measure, items_ordered),
    x_pos = (x_id - 1) * center_step,
    x_jit = x_pos + runif(n(), -0.7 * lane_width, 0.7 * lane_width),
    y_val = mean_val,
    iso2 = tolower(iso2)
  )

# MPWB sum and Life Satisfaction points
points_sum <- means_cty_sum |>
  dplyr::transmute(
    x_jit = runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
    y_val = MPWB_sum,
    iso2 = tolower(iso2)
  ) |>
  dplyr::left_join(means_cty_sum |> dplyr::select(iso2, flagged), by = "iso2")
Adding missing grouping variables: `country`
points_ls <- means_cty_ls |>
  dplyr::transmute(
    x_jit = runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
    y_val = Life_satisfaction,
    iso2 = tolower(iso2)
  ) |>
  dplyr::left_join(means_cty_ls |> dplyr::select(iso2, flagged), by = "iso2")
Adding missing grouping variables: `country`
mean_lines_items <- means_items_global |>
  dplyr::mutate(
    x_id = match(measure, items_ordered),
    x_pos = (x_id - 1) * center_step,
    xmin = x_pos - lane_width,
    xmax = x_pos + lane_width,
    y = mean_native
  )

mean_line_sum <- tibble::tibble(xmin = -lane_width,
                                xmax = lane_width,
                                y = mean_sum_global)
mean_line_ls <- tibble::tibble(xmin = -lane_width,
                               xmax = lane_width,
                               y = mean_ls_global)

# Panel MPWB items
p_items <- ggplot2::ggplot(points_items, ggplot2::aes(x = x_jit, y = y_val)) +
  ggplot2::geom_rect(
    data = bg_items,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      fill = fill
    ),
    inherit.aes = FALSE,
    color = NA
  ) +
  ggplot2::scale_fill_identity() +
  ggplot2::geom_rect(
    data = geo_items$overlay,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      alpha = overlay_alpha
    ),
    inherit.aes = FALSE,
    fill = "white",
    color = NA
  ) +
  ggplot2::scale_alpha_identity() +
  ggplot2::geom_segment(
    data = geo_items$edges[-nrow(geo_items$edges), ],
    ggplot2::aes(
      x = x,
      xend = xend,
      y = y,
      yend = yend
    ),
    inherit.aes = FALSE,
    color = "#b5bec9",
    linewidth = 0.2
  ) +
  ggplot2::geom_segment(
    data = mean_lines_items,
    ggplot2::aes(
      x = xmin,
      xend = xmax,
      y = y,
      yend = y
    ),
    inherit.aes = FALSE,
    color = "#9cacbc",
    linewidth = 0.4,
    linetype = "solid"
  ) +
  ggplot2::geom_point(
    shape = 21,
    size = 3.2,
    color = "#0B2E55",
    na.rm = TRUE
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2),
                     size = 3,
                     na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    limits = c(
      min(lane_items$x_pos - lane_width),
      max(lane_items$x_pos + lane_width)
    ),
    breaks = lane_items$x_pos,
    labels = lane_items$label,
    expand = c(0, 0)
  ) +
  ggplot2::scale_y_continuous(breaks = pretty(c(y_min_items, y_max_items), n = 7), expand = c(0, 0)) +
  ggplot2::labs(x = NULL, y = "Scores\n") +
  ggplot2::coord_cartesian(ylim = c(y_min_items, y_max_items), clip = "off") +
  theme_gmh

# Panel MPWB sum
p_sum <- ggplot2::ggplot(points_sum, ggplot2::aes(x = x_jit, y = y_val)) +
  ggplot2::geom_rect(
    data = bg_sum,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      fill = fill
    ),
    inherit.aes = FALSE,
    color = NA
  ) +
  ggplot2::scale_fill_identity() +
  ggplot2::geom_rect(
    data = geo_sum$overlay,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      alpha = overlay_alpha
    ),
    inherit.aes = FALSE,
    fill = "white",
    color = NA
  ) +
  ggplot2::scale_alpha_identity() +
  ggplot2::geom_segment(
    data = geo_sum$edges[-nrow(geo_sum$edges), ],
    ggplot2::aes(
      x = x,
      xend = xend,
      y = y,
      yend = yend
    ),
    inherit.aes = FALSE,
    color = "#b5bec9",
    linewidth = 0.2
  ) +
  ggplot2::geom_segment(
    data = mean_line_sum,
    ggplot2::aes(
      x = xmin,
      xend = xmax,
      y = y,
      yend = y
    ),
    inherit.aes = FALSE,
    color = "#9cacbc",
    linewidth = 0.4,
    linetype = "solid"
  ) +
  ggplot2::geom_point(
    shape = 21,
    size = 3.2,
    color = "#0B2E55",
    na.rm = TRUE
  ) +
  ggplot2::geom_point(
    data = points_sum |> dplyr::filter(flagged == TRUE),
    shape = 21,
    size = 3.2,
    color = "#b5bec9",
    na.rm = TRUE
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2),
                     size = 3,
                     na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    limits = c(-lane_width, lane_width),
    breaks = 0,
    labels = mpwb_names["MPWB_sum"],
    expand = c(0, 0)
  ) +
  ggplot2::scale_y_continuous(breaks = seq(30, 60, by = 5), expand = c(0, 0)) +
  ggplot2::labs(x = NULL, y = NULL) +
  ggplot2::coord_cartesian(ylim = c(y_min_sum, y_max_sum), clip = "off") +
  theme_gmh

# Panel Life satisfaction
p_ls <- ggplot2::ggplot(points_ls, ggplot2::aes(x = x_jit, y = y_val)) +
  ggplot2::geom_rect(
    data = bg_ls,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      fill = fill
    ),
    inherit.aes = FALSE,
    color = NA
  ) +
  ggplot2::scale_fill_identity() +
  ggplot2::geom_rect(
    data = geo_ls$overlay,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      alpha = overlay_alpha
    ),
    inherit.aes = FALSE,
    fill = "white",
    color = NA
  ) +
  ggplot2::scale_alpha_identity() +
  ggplot2::geom_segment(
    data = geo_ls$edges[-nrow(geo_ls$edges), ],
    ggplot2::aes(
      x = x,
      xend = xend,
      y = y,
      yend = yend
    ),
    inherit.aes = FALSE,
    color = "#b5bec9",
    linewidth = 0.2
  ) +
  ggplot2::geom_segment(
    data = mean_line_ls,
    ggplot2::aes(
      x = xmin,
      xend = xmax,
      y = y,
      yend = y
    ),
    inherit.aes = FALSE,
    color = "#9cacbc",
    linewidth = 0.4,
    linetype = "solid"
  ) +
  ggplot2::geom_point(
    shape = 21,
    size = 3.2,
    color = "#0B2E55",
    na.rm = TRUE
  ) +
  ggplot2::geom_point(
    data = points_ls |> dplyr::filter(flagged == TRUE),
    shape = 21,
    size = 3.2,
    color = "#b5bec9",
    na.rm = TRUE
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2),
                     size = 3,
                     na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    limits = c(-lane_width, lane_width),
    breaks = 0,
    labels = mpwb_names["Life_satisfaction"],
    expand = c(0, 0)
  ) +
  ggplot2::scale_y_continuous(
    limits = c(y_min_ls, y_max_ls),
    breaks = c(3.33, 4.17, 5.00, 5.83, 6.67, 7.50, 8.33),
    expand = c(0, 0)
  ) +
  ggplot2::labs(x = NULL, y = NULL) +
  ggplot2::coord_cartesian(ylim = c(y_min_ls, y_max_ls), clip = "off")
p_fig2 <- cowplot::plot_grid(
  p_items, p_sum, p_ls,
  nrow = 1, align = "v", axis = "lr", rel_widths = c(6, 1, 1)
)

p_fig2

ragg::agg_png("555_figure2.png", width = 3839, height = 2054, res = 300)
dev.off()
quartz_off_screen 
                2 

Figure 3

dnk_nor <- 
  points_items |> 
  dplyr::filter(country == "Denmark"| country == "Norway")

p_fig3 <- ggplot2::ggplot(dnk_nor, ggplot2::aes(x = x_jit, y = y_val)) +
  ggplot2::geom_rect(
    data = bg_items,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      fill = fill
    ),
    inherit.aes = FALSE,
    color = NA
  ) +
  ggplot2::scale_fill_identity() +
  ggplot2::geom_rect(
    data = geo_items$overlay,
    ggplot2::aes(
      xmin = xmin,
      xmax = xmax,
      ymin = ymin,
      ymax = ymax,
      alpha = overlay_alpha
    ),
    inherit.aes = FALSE,
    fill = "white",
    color = NA
  ) +
  ggplot2::scale_alpha_identity() +
  ggplot2::geom_segment(
    data = geo_items$edges[-nrow(geo_items$edges), ],
    ggplot2::aes(
      x = x,
      xend = xend,
      y = y,
      yend = yend
    ),
    inherit.aes = FALSE,
    color = "#b5bec9",
    linewidth = 0.2
  ) +
  ggplot2::geom_segment(
    data = mean_lines_items,
    ggplot2::aes(
      x = xmin,
      xend = xmax,
      y = y,
      yend = y
    ),
    inherit.aes = FALSE,
    color = "#9cacbc",
    linewidth = 0.4,
    linetype = "solid"
  ) +
  ggplot2::geom_point(
    shape = 21,
    size = 5.2,
    color = "#0B2E55",
    na.rm = TRUE
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2),
                     size = 5,
                     na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    limits = c(
      min(lane_items$x_pos - lane_width),
      max(lane_items$x_pos + lane_width)
    ),
    breaks = lane_items$x_pos,
    labels = lane_items$label,
    expand = c(0, 0)
  ) +
  ggplot2::scale_y_continuous(breaks = pretty(c(y_min_items, y_max_items), n = 7), expand = c(0, 0)) +
  ggplot2::labs(x = NULL, y = "Scores\n") +
  ggplot2::coord_cartesian(ylim = c(y_min_items, y_max_items), clip = "off") +
  theme_gmh
p_fig3

ragg::agg_png("555_figure3.png", width = 3839, height = 2054, res = 300)
dev.off()
quartz_off_screen 
                2 

Life Satisfaction Distribution

# Weighted mean life satisfaction by country
country_stats <- df_gmh |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    mean_ls = Hmisc::wtd.mean(
      x = life_satisfaction,
      weights = ps_weight,
      na.rm = TRUE
    )
  )

# Weighted distribution
dist_ls <- df_gmh |>
  dplyr::group_by(country, life_satisfaction) |>
  dplyr::summarise(
    w_n = sum(ps_weight, na.rm = TRUE),
    .groups = "drop"
  ) |>
  tidyr::complete(
    country,
    life_satisfaction = 0:10,
    fill = list(w_n = 0)
  ) |>
  dplyr::group_by(country) |>
  dplyr::mutate(
    w_total = sum(w_n),
    pct = ifelse(w_total > 0, 100 * w_n / w_total, 0),
    contrib = life_satisfaction * pct / 100
  ) |>
  dplyr::ungroup() |>
  dplyr::left_join(country_stats, by = "country") |>
  dplyr::mutate(
    country = forcats::fct_reorder(country, mean_ls),
    life_satisfaction = factor(life_satisfaction, levels = 0:10)
  )

# color palette
ls_palette <- grDevices::colorRampPalette(
  c("#020C14","#0047AB","#7EA1D6","#D5ABF7", "#FFFFCC",
    "#B6E3B6", "#00A36C","#13948B", "#054732")
  )(10)

# interactive stacked bar
p_ls <- dist_ls |>
  dplyr::arrange(life_satisfaction) |>
  plotly::plot_ly(
    x = ~contrib,
    y = ~country,
    type = "bar",
    orientation = "h",
    color = ~life_satisfaction,
    colors = ls_palette,
    text = ~paste0(country, "<br>",
      "Life satisfaction Level: ", life_satisfaction, ",", 
      sprintf("%.1f", pct), "%<br>"
    ),
    hoverinfo = "text"
  ) |>
  plotly::layout(
    barmode = "stack",
    yaxis = list(
      title = ""
    ),
    xaxis = list(
      title = "Life satisfaction",
      range = c(0, 10)
    ),
    legend = list(
      orientation = "h",
      title = list(text = "Life satisfaction"),
      y = 1.05,
      x = 0
    )
  )

# add mean labels above bars
ann_df <- country_stats |>
  dplyr::mutate(
    country = forcats::fct_reorder(country, mean_ls),
    mean_label = sprintf("%.2f", mean_ls)
  )

p_ls <- p_ls |>
  plotly::layout(
    annotations = lapply(seq_len(nrow(ann_df)), function(i) {
      list(
        y = ann_df$country[i],
        x = ann_df$mean_ls[i] + 0.1,
        text = ann_df$mean_label[i],
        showarrow = FALSE,
        yanchor = "left",
        font = list(size = 8)
      )
    })
  )

p_ls
p_ls

Correlation between Life satisfaction and MPWB items

# Unweighted Pearson's r
lapply(mpwb_items, function(i) {
  test <- stats::cor.test(df_gmh[[i]], df_gmh$life_satisfaction, method = "pearson")
  tibble(
    dimension = i,
    r = unname(test$estimate),
    p = test$p.value
  )
})|>
  dplyr::bind_rows()
# A tibble: 10 × 3
   dimension                       r     p
   <chr>                       <dbl> <dbl>
 1 mpwb_positive_relationships 0.425     0
 2 mpwb_meaning                0.569     0
 3 mpwb_competence             0.553     0
 4 mpwb_engagement             0.408     0
 5 mpwb_self_esteem            0.604     0
 6 mpwb_optimism               0.611     0
 7 mpwb_positive_emotion       0.703     0
 8 mpwb_emotional_stability    0.600     0
 9 mpwb_resilience             0.497     0
10 mpwb_vitality               0.600     0
# Weighted Pearson's r
weighted_corr(df_gmh, life_satisfaction, mpwb_items, multiple = TRUE)
                          item     r      t     p
1  mpwb_positive_relationships 0.427  98.43 <.001
2                 mpwb_meaning 0.575 161.30 <.001
3              mpwb_competence 0.554 155.62 <.001
4              mpwb_engagement 0.413 102.28 <.001
5             mpwb_self_esteem 0.611 183.50 <.001
6                mpwb_optimism 0.613 183.36 <.001
7        mpwb_positive_emotion 0.709 244.83 <.001
8     mpwb_emotional_stability 0.597 177.76 <.001
9              mpwb_resilience 0.500 133.30 <.001
10               mpwb_vitality 0.598 189.27 <.001

Correlation between MPWB items and MPWB Sum

# Individual MPWB items vs MPWB sum
weighted_corr(df_gmh, mpwb_sum, mpwb_items, multiple = TRUE)
                          item     r      t     p
1  mpwb_positive_relationships 0.581 176.19 <.001
2                 mpwb_meaning 0.786 394.33 <.001
3              mpwb_competence 0.770 359.18 <.001
4              mpwb_engagement 0.636 198.18 <.001
5             mpwb_self_esteem 0.813 439.34 <.001
6                mpwb_optimism 0.786 413.00 <.001
7        mpwb_positive_emotion 0.829 500.51 <.001
8     mpwb_emotional_stability 0.768 379.80 <.001
9              mpwb_resilience 0.700 266.17 <.001
10               mpwb_vitality 0.784 429.53 <.001
# Leave-one-out correlations
purrr::map_dfr(mpwb_items, function(item) {
  x_vec <- df_gmh[[item]]
  df_tmp <- df_gmh |> dplyr::mutate(loo_total = mpwb_sum - x_vec)
  
  res <- rlang::eval_tidy(
    rlang::call2("weighted_corr", df_tmp, rlang::sym(item), rlang::sym("loo_total"))
  )

  tibble::tibble(
    dimension = item,
    r = res[[1]],
    t = res[[2]],
    p = res[[3]]
  )
})
# A tibble: 10 × 4
   dimension                   r     t      p    
   <chr>                       <chr> <chr>  <chr>
 1 mpwb_positive_relationships 0.483 132.04 <.001
 2 mpwb_meaning                0.726 288.87 <.001
 3 mpwb_competence             0.708 280.15 <.001
 4 mpwb_engagement             0.555 148.43 <.001
 5 mpwb_self_esteem            0.759 318.11 <.001
 6 mpwb_optimism               0.721 294.42 <.001
 7 mpwb_positive_emotion       0.780 383.33 <.001
 8 mpwb_emotional_stability    0.702 265.15 <.001
 9 mpwb_resilience             0.621 198.72 <.001
10 mpwb_vitality               0.720 299.57 <.001

Correlation between Ranking of Life Satisfaction and MPWB Sum by Country

# Country estimates
means_cty_rank <-
  df_gmh |>
  dplyr::group_by(country, iso2) |>
  dplyr::summarise(
    
    mpwb_mean = base::mean(mpwb_sum, na.rm = TRUE),
    ls_mean = base::mean(life_satisfaction, na.rm = TRUE),
    
    mpwb_mean_w = base::sum(mpwb_sum * ps_weight, na.rm = TRUE) /
      base::sum(ps_weight[!base::is.na(mpwb_sum)], na.rm = TRUE),
    
    ls_mean_w = base::sum(life_satisfaction * ps_weight, na.rm = TRUE) /
      base::sum(ps_weight[!base::is.na(life_satisfaction)], na.rm = TRUE),
    
    .groups = "drop"
  ) |>
  dplyr::mutate(
    iso2 = base::tolower(iso2),
    alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1),
    
    rank_ls = dplyr::min_rank(dplyr::desc(ls_mean)),
    rank_mpwb = dplyr::min_rank(dplyr::desc(mpwb_mean)),
    
    rank_ls_w = dplyr::min_rank(dplyr::desc(ls_mean_w)),
    rank_mpwb_w = dplyr::min_rank(dplyr::desc(mpwb_mean_w))
  )

means_cty_rank |>
  print_reactable(sorted_col = "rank_ls_w", width = 800)
# Correlations unweighted
rho_rank_unw <- stats::cor(
  means_cty_rank$rank_ls,
  means_cty_rank$rank_mpwb,
  method = "pearson"
)

tau_rank_unw <- stats::cor(
  means_cty_rank$rank_ls,
  means_cty_rank$rank_mpwb,
  method = "kendall"
)

r_mean_p_unw <- stats::cor(
  means_cty_rank$ls_mean,
  means_cty_rank$mpwb_mean,
  method = "pearson"
)

r_mean_s_unw <- stats::cor(
  means_cty_rank$ls_mean,
  means_cty_rank$mpwb_mean,
  method = "spearman"
)

# Correlations weighted
rho_rank_w <- stats::cor(
  means_cty_rank$rank_ls_w,
  means_cty_rank$rank_mpwb_w,
  method = "pearson"
)

tau_rank_w <- stats::cor(
  means_cty_rank$rank_ls_w,
  means_cty_rank$rank_mpwb_w,
  method = "kendall"
)

r_mean_p_w <- stats::cor(
  means_cty_rank$ls_mean_w,
  means_cty_rank$mpwb_mean_w,
  method = "pearson"
)

r_mean_s_w <- stats::cor(
  means_cty_rank$ls_mean_w,
  means_cty_rank$mpwb_mean_w,
  method = "spearman"
)

# Summary
tibble::tibble(
    weighting = c("unweighted", "weighted"),
    rho_rank = c(rho_rank_unw, rho_rank_w),
    tau_rank = c(tau_rank_unw, tau_rank_w),
    r_mean_p = c(r_mean_p_unw, r_mean_p_w),
    r_mean_s = c(r_mean_s_unw, r_mean_s_w)
  ) |>
  dplyr::mutate(
    dplyr::across(
      dplyr::where(base::is.numeric),
      ~ base::round(.x, 3)
    )
  )
# A tibble: 2 × 5
  weighting  rho_rank tau_rank r_mean_p r_mean_s
  <chr>         <dbl>    <dbl>    <dbl>    <dbl>
1 unweighted    0.832    0.656    0.842    0.832
2 weighted      0.853    0.683    0.853    0.853
Show the code
# Weighted Ranks (1 = highest)
ggplot2::ggplot(means_cty_rank, 
                ggplot2::aes(x = rank_ls_w, y = rank_mpwb_w)) +
  ggplot2::geom_smooth(
    method = "lm",
    se = FALSE,
    color = "#6F7C91",
    linewidth = 0.8
  ) +
  ggplot2::labs(
    subtitle = bquote("Pearson's r" == .(sprintf("%.3f", rho_rank_w))),
    x = "Life Satisfaction Rank",
    y = "MPWB Sum Rank"
  ) +
  ggplot2::geom_point(
    ggplot2::aes(alpha = alpha_country),
    shape = 21,
    colour = "#051520",
    size = 3.8
  ) +
  ggfx::with_shadow(
    ggplot2::geom_point(
      ggplot2::aes(alpha = alpha_country),
      size = 4.2,
      alpha = 0.5,
      stroke = 0
    ),
    sigma = 2,
    colour = "gray60",
    x_offset = 1,
    y_offset = 1
  ) +
  ggflags::geom_flag(
    ggplot2::aes(country = iso2),
    size = 3.5,
    na.rm = TRUE
  ) +
  ggplot2::theme(
    legend.position = "none",
    panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.line.x = element_blank()
  ) +
  ggplot2::guides(alpha = "none") +
  ggplot2::coord_flip() +
  ggplot2::scale_x_reverse() +
  ggplot2::scale_y_reverse()

Show the code
# Weighted Means
ggplot2::ggplot(means_cty_rank, ggplot2::aes(x = ls_mean_w, y = mpwb_mean_w)) +
  ggplot2::geom_smooth(
    method = "lm",
    se = FALSE,
    color = "#6F7C91",
    linewidth = 0.8
  ) +
  ggplot2::labs(
    subtitle = bquote("Pearson's r" == .(sprintf("%.2f", r_mean_p_w))),
    x = "Life Satisfaction",
    y = "MPWB Sum"
  ) +
  ggplot2::geom_point(
    ggplot2::aes(alpha = alpha_country),
    shape = 21,
    colour = "#051520",
    size = 3.8
  ) +
  ggfx::with_shadow(
    ggplot2::geom_point(
      ggplot2::aes(alpha = alpha_country),
      size = 4.2,
      alpha = 0.5,
      stroke = 0
    ),
    sigma = 2,
    colour = "gray60",
    x_offset = 1,
    y_offset = 1
  ) +
  ggflags::geom_flag(
    ggplot2::aes(country = iso2),
    size = 3.5,
    na.rm = TRUE
  ) +
  ggplot2::theme(
    legend.position = "none",
    panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.line.x = element_blank()
  ) +
  ggplot2::guides(alpha = "none") +
  ggplot2::coord_flip()

# Compare with ladder item from World Happiness Report
whr <- tribble(
  ~iso2, ~ladder_score,
  "FI", 7.736,
  "DK", 7.521,
  "SE", 7.345,
  "NL", 7.306,
  "NO", 7.262,
  "IL", 7.234,
  "MX", 6.979,
  "AU", 6.974,
  "CH", 6.935,
  "BE", 6.91,
  "IE", 6.889,
  "AT", 6.81,
  "CA", 6.803,
  "SI", 6.792,
  "CZ", 6.775,
  "AE", 6.759,
  "DE", 6.753,
  "GB", 6.728,
  "US", 6.724,
  "PL", 6.673,
  "TW", 6.669,
  "UY", 6.661,
  "XK", 6.659,
  "KW", 6.629,
  "RS", 6.606,
  "SA", 6.6,
  "FR", 6.593,
  "SG", 6.565,
  "RO", 6.563,
  "BR", 6.494,
  "ES", 6.466,
  "EE", 6.417,
  "IT", 6.415,
  "AR", 6.397,
  "KZ", 6.378,
  "CL", 6.361,
  "TH", 6.222,
  "SK", 6.221,
  "LV", 6.207,
  "OM", 6.197,
  "UZ", 6.193,
  "JP", 6.147,
  "BA", 6.136,
  "PH", 6.107,
  "KR", 6.038,
  "BH", 6.03,
  "PT", 6.013,
  "EC", 5.965,
  "MY", 5.955,
  "PE", 5.947,
  "RU", 5.945,
  "CY", 5.942,
  "CN", 5.921,
  "HU", 5.915,
  "ME", 5.877,
  "HR", 5.87,
  "BO", 5.868,
  "KG", 5.858,
  "MN", 5.833,
  "MD", 5.819,
  "GR", 5.776,
  "ID", 5.617,
  "DZ", 5.571,
  "BG", 5.554,
  "MK", 5.503,
  "AM", 5.494,
  "HK", 5.491,
  "AL", 5.411,
  "GE", 5.4,
  "TR", 5.262,
  "ZA", 5.213,
  "MZ", 5.19,
  "IR", 5.093,
  "NG", 4.885,
  "SN", 4.856,
  "PK", 4.768,
  "UA", 4.68,
  "MA", 4.622,
  "UG", 4.461,
  "IN", 4.389,
  "TD", 4.384,
  "MG", 4.157,
  "ZM", 3.912,
  "ET", 3.898,
  "BD", 3.851,
  "EG", 3.817,
  "YE", 3.561,
  "ZW", 3.396,
  "LB", 3.188
) |>
  dplyr::mutate(
    iso2 = base::tolower(iso2)
  )

means_cty_rank_whr <- means_cty_rank |>
  dplyr::left_join(whr, by = c("iso2")) |>
  dplyr::mutate(
    rank_whr = dplyr::min_rank(dplyr::desc(ladder_score))
  ) |>
  dplyr::filter(!is.na(ladder_score))

stats::cor(
  means_cty_rank_whr$rank_ls_w,
  means_cty_rank_whr$rank_whr,
  method = "pearson"
) |> round(3)
[1] 0.032
stats::cor(
  means_cty_rank_whr$rank_ls_w,
  means_cty_rank_whr$rank_whr,
  method = "kendall"
) |> round(3)
[1] 0.022
stats::cor(
  means_cty_rank_whr$ls_mean_w,
  means_cty_rank_whr$ladder_score,
  method = "pearson"
) |> round(3)
[1] 0.129
stats::cor(
  means_cty_rank_whr$ls_mean_w,
  means_cty_rank_whr$ladder_score,
  method = "spearman"
) |> round(3)
[1] 0.031
# Plot
ggplot2::ggplot(means_cty_rank_whr, 
                ggplot2::aes(x = rank_ls_w, y = rank_whr)) +

  ggplot2::labs(
    subtitle = bquote("Pearson's r" == .(sprintf("%.3f", rho_rank_w_whr))),
    x = "Life Satisfaction Rank",
    y = "Ladder Rank (World Happiness Report)"
  ) +
  ggplot2::geom_point(
    ggplot2::aes(alpha = alpha_country),
    shape = 21,
    colour = "#051520",
    size = 3.8
  ) +
  ggfx::with_shadow(
    ggplot2::geom_point(
      ggplot2::aes(alpha = alpha_country),
      size = 4.2,
      alpha = 0.5,
      stroke = 0
    ),
    sigma = 2,
    colour = "gray60",
    x_offset = 1,
    y_offset = 1
  ) +
  ggflags::geom_flag(
    ggplot2::aes(country = iso2),
    size = 3.5,
    na.rm = TRUE
  ) +
  ggplot2::theme(
    legend.position = "none",
    panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.line.x = element_blank()
  ) +
  ggplot2::guides(alpha = "none") +
  ggplot2::coord_flip() +
  ggplot2::scale_x_reverse() +
  ggplot2::scale_y_reverse()
Error in eval(e[[2L]], where): object 'rho_rank_w_whr' not found

MPWB Sum by Income Decile across Countries

df_income <-
  df_gmh |>
  dplyr::filter(
    employment_primary != "Student non-working (Full or part-time)",
    !base::is.na(income_merg_cat)
  ) |>
  dplyr::mutate(
    iso2 = base::tolower(iso2)
  )

# income weighted means
means_cty_income <-
  df_income |>
  dplyr::group_by(country, income_merg_cat, iso2) |>
  dplyr::summarise(
    n = dplyr::n(),
    mpwb_sum_income = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
    .groups = "drop"
  ) |>
  dplyr::mutate(
    lane_label = base::as.character(income_merg_cat),
    flagged = country %in% flagged_countries
  ) |>
  # remove groups with less than 20 participants (unweighted)
  dplyr::filter(n >= 20)

eu_income <-
  df_income |>
  dplyr::filter(country %in% eu_countries) |>
  dplyr::group_by(income_merg_cat) |>
  dplyr::summarise(
    mpwb_sum_income = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
    .groups = "drop"
  ) |>
  dplyr::mutate(
    country = "EU",
    iso2 = "eu",
    lane_label = base::as.character(income_merg_cat),
    n = NA_integer_,
    flagged = FALSE
  )

means_cty_income <-
  dplyr::bind_rows(means_cty_income, eu_income)

# lane layout
lane_income <-
  tibble::tibble(
    x_id  = base::seq_along(income_order),
    label = income_order,
    x_pos = (x_id - 1) * center_step
  )
Error: object 'income_order' not found
bg_income  <- make_bg(
  lane_income,
  y_min_sum,
  y_max_sum,
  dy_native = dy,
  native_to_10_70 = to10_70_sum
)
Error: object 'lane_income' not found
geo_income <- make_lane_geometry(
  lane_income,
  y_min_sum,
  y_max_sum
)
Error: object 'lane_income' not found
mean_lines_income <-
  df_income |>
  dplyr::group_by(income_merg_cat) |>
  dplyr::summarise(
    y = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
    .groups = "drop"
  ) |>
  dplyr::mutate(label = base::as.character(income_merg_cat)) |>
  dplyr::left_join(lane_income, by = "label") |>
  dplyr::transmute(
    xmin = x_pos - lane_width,
    xmax = x_pos + lane_width,
    y = y
  )
Error: object 'lane_income' not found
points_income <-
  means_cty_income |>
  dplyr::mutate(
    x_id  = base::match(lane_label, lane_income$label),
    x_pos = (x_id - 1) * center_step,
    x_jit = x_pos + stats::runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
    y_val = mpwb_sum_income
  )
Error in `dplyr::mutate()`:
ℹ In argument: `x_id = base::match(lane_label, lane_income$label)`.
Caused by error:
! object 'lane_income' not found
ggplot2::ggplot(points_income, ggplot2::aes(x = x_jit, y = y_val)) +
  ggplot2::geom_rect(
    data = bg_income,
    ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill),
    inherit.aes = FALSE,
    color = NA
  ) +
  ggplot2::scale_fill_identity() +
  ggplot2::geom_rect(
    data = geo_income$overlay,
    ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, alpha = overlay_alpha),
    inherit.aes = FALSE,
    fill = "white",
    color = NA
  ) +
  ggplot2::scale_alpha_identity() +
  ggplot2::geom_segment(
    data = geo_income$edges[-base::nrow(geo_income$edges),],
    ggplot2::aes(x = x, xend = xend, y = y, yend = yend),
    inherit.aes = FALSE,
    color = "#b5bec9",
    linewidth = 0.2
  ) +
  ggplot2::geom_segment(
    data = mean_lines_income,
    ggplot2::aes(x = xmin, xend = xmax, y = y, yend = y),
    inherit.aes = FALSE,
    color = "#9cacbc",
    linewidth = 0.4,
    linetype = "solid"
  ) +
  ggplot2::geom_point(
    shape = 21,
    size = 3.2,
    color = "#0B2E55",
    na.rm = TRUE
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(points_income, flagged),
    shape = 21,
    size = 3.2,
    color = "#b5bec9",
    na.rm = TRUE
  ) +
  ggflags::geom_flag(ggplot2::aes(country = iso2), size = 3, na.rm = TRUE) +
  ggplot2::scale_x_continuous(
    limits = c(
      base::min(lane_income$x_pos - lane_width),
      base::max(lane_income$x_pos + lane_width)
    ),
    breaks = lane_income$x_pos,
    labels = lane_income$label,
    expand = c(0,0)
  ) +
  ggplot2::scale_y_continuous(
    breaks = base::seq(25, 60, by = 5),
    expand = c(0,0)
  ) +
  ggplot2::labs(
    x = NULL,
    y = "MPWB sum\n"
  ) +
  ggplot2::coord_cartesian(ylim = c(y_min_sum, y_max_sum), clip = "off")
Error: object 'points_income' not found
points_income
Error: object 'points_income' not found
ragg::agg_png("555_mpwb_by_income_cntry.png", width = 3839, height = 2054, res = 300)
dev.off()
quartz_off_screen 
                2 

A13. Differences in well-being between main groups.

Figure 4

# Relabel variables for Figure 4
df_fig4 <- df_gmh

# Sex: Additional spaces to force desired balanced
df_fig4$sex_reviewed_cat <- base::factor(
  df_fig4$sex_reviewed_cat,
  levels = c("", "Male", " ", "  ", "Female","   ","    ", "Other","     ")
)

# Employment: Add line breaks for better fitting
emp_lvls <- levels(df_fig4$employment_primary)
emp_new  <- emp_lvls |>
  stringr::str_replace(
    "Employed/working full-time \\(25\\+ hours per week\\)",
    "\nEmployed/working full-time\n(25+ hours per week)\n"
  ) |>
  stringr::str_replace(
    "Employed/working part-time \\(less than 25 hours per week\\)",
    "Employed/working part-time\n(less than 25 hours per week)"
  ) |>
  stringr::str_replace(
    "^Full-time Student / Part-time Student without employment$",
    "Student non-working\n(Full or part-time)"
  ) |>
  stringr::str_replace(
    "^Not in paid employment \\(by choice/health\\)$",
    "Not in paid employment\n(by choice/health)\n"
  ) |>
  stringr::str_replace(
    "^Not in paid employment \\(looking for work\\)$",
    "Not in paid employment\n(looking for work)\n"
  )

df_fig4$employment_primary <- 
  base::factor(df_fig4$employment_primary, levels = emp_lvls, labels = emp_new)

# Education: Add line breaks for better fitting
edu_lvls <- levels(df_fig4$education_recoded_cat)
edu_new  <- edu_lvls |>
  stringr::str_replace(
    "Less than secondary",
    "Less than\nsecondary"
  )

df_fig4$education_recoded_cat <- 
  base::factor(df_fig4$education_recoded_cat, levels = edu_lvls, labels = edu_new)

# Work arrangement: Add line breaks for better fitting
work_lvls <- levels(df_fig4$work_arrangement_cat_nostudents)
work_new  <- work_lvls |>
  stringr::str_replace(
    "^I work entirely in-person \\(i\\.e\\., in an office, on-site\\)$",
    "I work entirely in-person\n(i.e., in an office, on-site)"
  ) |>
  stringr::str_replace(
    "^I work about evenly in-person/remote$",
    "I work about evenly\nin-person/remote"
  ) |>
  stringr::str_replace(
    "^I mostly work remotely, with occasional in-person days$",
    "I mostly work remotely,\nwith occasional in-person days"
  ) |>
  stringr::str_replace(
    "^I mostly work in-person, with occasional remote days$",
    "I mostly work in-person,\nwith occasional remote days"
  )

df_fig4$work_arrangement_cat_nostudents <- 
  base::factor(df_fig4$work_arrangement_cat_nostudents, 
               levels = work_lvls, labels = work_new)

# Citizenship: Add line breaks for better fitting
cit_lvls <- levels(df_fig4$citizenship_cat)
cit_new  <- cit_lvls |>
  stringr::str_replace(
    "^Non-citizen \\(Permanent Resident\\)$",
    "Non-citizen (Permanent\nResident)"
  ) |>
  stringr::str_replace(
    "^Born outside country \\(Citizen\\)$",
    "Born outside country\n(Citizen)\n"
  ) |>
  stringr::str_replace(
    "^Born outside country \\(Non-citizen, Permanent Resident\\)$",
    "Born outside country\n(Non-citizen,\nPermanent Resident)\n"
  ) |>
  stringr::str_replace(
    "^Born outside country \\(Non-citizen, Non-permanent Resident\\)$",
    "\nBorn outside country\n(Non-citizen,\nNon-permanent Resident)\n"
  )

df_fig4$citizenship_cat <- factor(df_fig4$citizenship_cat, levels = cit_lvls, labels = cit_new)

# Compute weighted means and CIs per group
# We will use survey package for this for simplicity in computing CIs
dsgn <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = df_fig4)

weighted_summary <- function(varname, label) {
  lvls <- levels(df_fig4[[varname]])
  tmp <- survey::svyby(
    ~mpwb_sum, as.formula(paste0("~", varname)),
    dsgn, survey::svymean, vartype = "ci", keep.names = FALSE
  ) |>
    as.data.frame()
  
  names(tmp)[1] <- "group"
  tibble::tibble(group = lvls) |>
    dplyr::left_join(tmp, by = "group") |>
    dplyr::mutate(variable = label, group = factor(group, levels = rev(lvls)))
}

# Assemble
all_sum <- dplyr::bind_rows(
  weighted_summary("age_group", "Age"),
  weighted_summary("sex_reviewed_cat", "Sex"),
  weighted_summary("education_recoded_cat", "Education Level"),
  weighted_summary("employment_primary", "Employment Status"),
  weighted_summary("income_merg_group", "Household Income Level"),
  weighted_summary("childhood_SES_cat", "Childhood Socioeconomic Status"),
  weighted_summary("citizenship_cat", "Citizenship"),
  weighted_summary("household_size_group", "Household Size"),
  weighted_summary("work_arrangement_cat_nostudents", "Work Arrangement")
) |>
  dplyr::mutate(
    variable = base::factor(
      variable,
      levels = c(
        "Age",
        "Sex",
        "Education Level",
        "Employment Status",
        "Household Income Level",
        "Childhood Socioeconomic Status",
        "Citizenship",
        "Household Size",
        "Work Arrangement"
      )
    )
  ) |>
  dplyr::group_by(variable) |>
  dplyr::arrange(group, .by_group = TRUE) |>
  dplyr::mutate(
    alt = dplyr::row_number() %% 2,
    point_c = ifelse(alt == 1, "#051265", "#16786d"),
    ci_c = ifelse(alt == 1, "#4d87ff", "#4fd1c4")
  ) |>
  dplyr::ungroup()

# Plot
p_fig4 <- 
  ggplot2::ggplot(all_sum, ggplot2::aes(y = group, x = mpwb_sum)) +
  ggplot2::geom_vline(
    xintercept = mean_mpwb, color = "#BCCBCA", linewidth = 0.45, linetype = "dashed"
  ) +
  ggplot2::geom_errorbar(
    ggplot2::aes(xmin = ci_l, xmax = ci_u, color = I(ci_c)), 
    orientation = "y", linewidth = 0.8, width = 0.15, na.rm = TRUE
  ) +
  ggplot2::geom_point(
    ggplot2::aes(color = I(point_c)), size = 1.1, stroke = 0.2, na.rm = TRUE
  ) +
  ggfx::with_shadow(
    geom_point(aes(color = I(point_c)), size = 1.6, alpha = 1, stroke = 0.2),
    sigma = 3, colour = "gray60", x_offset = 1, y_offset = 1
  ) +
  ggplot2::facet_wrap(~variable, ncol = 3, scales = "free_y") +
  ggplot2::labs(x = "\nMPWB Sum", y = NULL) +
  theme_gmh +
  ggplot2::theme(
    panel.grid.major.x = 
      ggplot2::element_line(color = "#ECF3F3", linetype = "solid", linewidth = 0.25),
    panel.grid.minor.x = 
      ggplot2::element_line(color = "#ECF3F3", linetype = "solid", linewidth = 0.25),
    strip.text = ggplot2::element_text(color = "#051520", face = "bold"),
    plot.margin = ggplot2::margin(15, 25, 15, 15),
    panel.spacing.x = grid::unit(6, "lines")
  )
Error: object 'mean_mpwb' not found
tiff(filename = "555_figure4.tiff", width = 17, height = 10, units = "in", res = 300)
p_fig4
Error: object 'p_fig4' not found
dev.off()
quartz_off_screen 
                2 

MPWB and Income

df_dem <- df_gmh |> 
  dplyr::filter(!is.na(income_merg)) |>
  dplyr::mutate(
    household_size_z = scale(household_size, center = TRUE, scale = TRUE),
    income_merg_group_n = dplyr::case_when(
      income_merg_group == "No income" ~ 0,
      income_merg_group == "Low" ~ 1,
      income_merg_group == "Mid" ~ 2,
      income_merg_group == "Upper" ~ 3,
      income_merg_group == "Wealthiest" ~ 4
    )
  )

# Correlation between income and mpwb
weighted_corr(df_dem, income_merg_group_n, mpwb_sum)
      r     t     p
1 0.127 28.64 <.001
# Regression
svy_dem <-
  survey::svydesign(ids = ~ 1, data = df_dem, weights = ~ ps_weight)

model_inc <- 
  survey::svyglm(mpwb_sum ~ income_merg_group, design = svy_dem)

summary(model_inc)

Call:
svyglm(formula = mpwb_sum ~ income_merg_group, design = svy_dem)

Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)         46.74625    0.09535 490.258 < 0.0000000000000002 ***
income_merg_group.L  4.52028    0.27462  16.460 < 0.0000000000000002 ***
income_merg_group.Q -0.18397    0.24347  -0.756               0.4499    
income_merg_group.C  0.42761    0.16961   2.521               0.0117 *  
income_merg_group^4 -0.52972    0.13556  -3.908            0.0000933 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 120.6656)

Number of Fisher Scoring iterations: 2
print_summ(model_inc, svy_dem, "mpwb_sum", "income_merg_group")
# A tibble: 1 × 7
  Ward_F   df1   df2 p      r2     cohens_f percent_var_explained
  <chr>  <int> <dbl> <chr>  <chr>  <chr>    <chr>                
1 117.50     4 53077 < .001 0.0168 0.1307   1.6793               
# Pairwise comparisons consecutive income groups
emm_inc <- emmeans::emmeans(
  model_inc,
  ~ income_merg_group
); emm_inc
 income_merg_group emmean    SE    df lower.CL upper.CL
 No income          43.59 0.378 53077    42.85    44.33
 Low                45.89 0.113 53077    45.67    46.11
 Mid                46.46 0.138 53077    46.20    46.73
 Upper              48.21 0.117 53077    47.98    48.44
 Wealthiest         49.58 0.197 53077    49.19    49.96

Confidence level used: 0.95 
adj_contr <-
  emmeans::contrast(
    emm_inc,
    method = "consec",
    adjust = "holm"
  )

summary(
    adj_contr,
    infer = TRUE
  )
 contrast           estimate    SE    df lower.CL upper.CL t.ratio p.value
 Low - No income       2.299 0.395 53077    1.313     3.29   5.825  <.0001
 Mid - Low             0.575 0.178 53077    0.131     1.02   3.233  0.0012
 Upper - Mid           1.743 0.181 53077    1.292     2.19   9.643  <.0001
 Wealthiest - Upper    1.371 0.229 53077    0.798     1.94   5.980  <.0001

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 4 estimates 
P value adjustment: holm method for 4 tests 

MPWB, Income and Household size

# Correlation
weighted_corr(df_dem, household_size, mpwb_sum)
      r     t     p
1 0.126 28.55 <.001
weighted_corr(df_dem, income_merg_group_n, household_size)
      r     t     p
1 0.129 27.65 <.001
# Regression household size
model_hh_main <- 
  survey::svyglm(mpwb_sum ~ household_size_group, design = svy_dem)

summary(model_hh_main)

Call:
svyglm(formula = mpwb_sum ~ household_size_group, design = svy_dem)

Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)

Coefficients:
                       Estimate Std. Error t value             Pr(>|t|)    
(Intercept)            47.36664    0.07059 671.043 < 0.0000000000000002 ***
household_size_group.L  3.73118    0.17145  21.762 < 0.0000000000000002 ***
household_size_group.Q -0.38256    0.16966  -2.255             0.024147 *  
household_size_group.C  0.51847    0.13877   3.736             0.000187 ***
household_size_group^4 -0.35829    0.14904  -2.404             0.016220 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 120.138)

Number of Fisher Scoring iterations: 2
print_summ(model_hh_main, svy_dem, "mpwb_sum", "household_size_group")
# A tibble: 1 × 7
  Ward_F   df1   df2 p      r2     cohens_f percent_var_explained
  <chr>  <int> <dbl> <chr>  <chr>  <chr>    <chr>                
1 148.28     4 53077 < .001 0.0211 0.1468   2.1091               
# Pairwise comparisons consecutive household size groups
emm_hh <- emmeans::emmeans(
  model_hh_main,
  ~ household_size_group
); emm_hh
 household_size_group emmean    SE    df lower.CL upper.CL
 1                      44.6 0.139 53077     44.3     44.9
 2                      46.8 0.125 53077     46.5     47.0
 3                      47.3 0.165 53077     47.0     47.6
 4-5                    48.5 0.127 53077     48.2     48.7
 6-20                   49.6 0.215 53077     49.2     50.1

Confidence level used: 0.95 
adj_contr <-
  emmeans::contrast(
    emm_hh,
    method = "consec",
    adjust = "holm"
  )

summary(
    adj_contr,
    infer = TRUE
  )
 contrast       estimate    SE    df lower.CL upper.CL t.ratio p.value
 2 - 1             2.193 0.187 53077  1.72597     2.66  11.736  <.0001
 3 - 2             0.526 0.207 53077  0.00877     1.04   2.540  0.0111
 (4-5) - 3         1.178 0.208 53077  0.65769     1.70   5.655  <.0001
 (6-20) - (4-5)    1.151 0.250 53077  0.52696     1.77   4.607  <.0001

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 4 estimates 
P value adjustment: holm method for 4 tests 
# Regression income and household size
model_inc_hh_main <- 
  survey::svyglm(mpwb_sum ~ income_merg_group + household_size_z, design = svy_dem)

summary(model_inc_hh_main)

Call:
svyglm(formula = mpwb_sum ~ income_merg_group + household_size_z, 
    design = svy_dem)

Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)         46.76915    0.09531 490.715 < 0.0000000000000002 ***
income_merg_group.L  4.04430    0.27596  14.655 < 0.0000000000000002 ***
income_merg_group.Q -0.32627    0.24334  -1.341             0.179987    
income_merg_group.C  0.35690    0.16963   2.104             0.035386 *  
income_merg_group^4 -0.51333    0.13502  -3.802             0.000144 ***
household_size_z     1.23345    0.06856  17.991 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 119.1795)

Number of Fisher Scoring iterations: 2
print_summ(
  model_inc_hh_main, svy_dem, "mpwb_sum", "income_merg_group + household_size_z")
Error in solve.default(V): 'a' is 0-diml
# Regression interaction
model_inc_hh_int <- 
  survey::svyglm(mpwb_sum ~ income_merg_group * household_size_z, design = svy_dem)

summary(model_inc_hh_int)

Call:
svyglm(formula = mpwb_sum ~ income_merg_group * household_size_z, 
    design = svy_dem)

Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)

Coefficients:
                                     Estimate Std. Error t value             Pr(>|t|)    
(Intercept)                          46.78676    0.09772 478.800 < 0.0000000000000002 ***
income_merg_group.L                   4.21537    0.28326  14.882 < 0.0000000000000002 ***
income_merg_group.Q                  -0.35103    0.25021  -1.403             0.160636    
income_merg_group.C                   0.45699    0.17280   2.645             0.008181 ** 
income_merg_group^4                  -0.51683    0.13517  -3.823             0.000132 ***
household_size_z                      1.05511    0.10051  10.498 < 0.0000000000000002 ***
income_merg_group.L:household_size_z -0.11972    0.29405  -0.407             0.683896    
income_merg_group.Q:household_size_z -0.71939    0.25789  -2.790             0.005280 ** 
income_merg_group.C:household_size_z  0.46439    0.17749   2.616             0.008887 ** 
income_merg_group^4:household_size_z  0.15260    0.13252   1.152             0.249493    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 119.0505)

Number of Fisher Scoring iterations: 2
print_summ(
  model_inc_hh_int, svy_dem, "mpwb_sum", "income_merg_group * household_size_z")
Error in solve.default(V): 'a' is 0-diml
car::vif(model_inc_hh_int)
there are higher-order terms (interactions) in this model
consider setting type = 'predictor'; see ?vif
                                       GVIF Df GVIF^(1/(2*Df))
income_merg_group                  1.192992  4        1.022303
household_size_z                   2.481244  1        1.575196
income_merg_group:household_size_z 2.765384  4        1.135584
# Simple slopes of income at different household sizes
slopes_inc_hh_int <- emmeans::emtrends(
    model_inc_hh_int,
    ~ income_merg_group,
    var = "household_size_z",
    adjust = "holm"
  )

# Plot
interactions::interact_plot(
  model_inc_hh_int,
  pred = household_size_z,
  modx = income_merg_group,
  interval = TRUE
)

MPWB, Income and Childhood SES

df_inc_ses <- df_gmh |> 
  dplyr::filter(!is.na(income_merg) & !is.na(childhood_SES)) |>
  dplyr::mutate(
    income_merg_group_n = dplyr::case_when(
      income_merg_group == "No income" ~ 0,
      income_merg_group == "Low" ~ 1,
      income_merg_group == "Mid" ~ 2,
      income_merg_group == "Upper" ~ 3,
      income_merg_group == "Wealthiest" ~ 4
    )
  )

weighted_corr(df_inc_ses, childhood_SES, mpwb_sum)
      r     t     p
1 0.135 26.26 <.001
weighted_corr(df_inc_ses, income_merg_group_n, childhood_SES)
      r     t     p
1 0.173 34.44 <.001
svy_inc_ses <-
  survey::svydesign(
    ids = ~1,
    data = df_inc_ses,
    weights = ~ps_weight
  )

model_ses <-
  survey::svyglm(
    mpwb_sum ~ childhood_SES_cat,
    design = svy_inc_ses
  )

summary(model_ses)

Call:
svyglm(formula = mpwb_sum ~ childhood_SES_cat, design = svy_inc_ses)

Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)

Coefficients:
                    Estimate Std. Error t value            Pr(>|t|)    
(Intercept)         46.54214    0.12214 381.060 <0.0000000000000002 ***
childhood_SES_cat.L  4.49242    0.35785  12.554 <0.0000000000000002 ***
childhood_SES_cat.Q -0.43210    0.30947  -1.396               0.163    
childhood_SES_cat.C -0.04965    0.22557  -0.220               0.826    
childhood_SES_cat^4  0.39595    0.15377   2.575               0.010 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 123.7373)

Number of Fisher Scoring iterations: 2
print_summ(model_ses, svy_inc_ses, "mpwb_sum", "childhood_SES_cat")
# A tibble: 1 × 7
  Ward_F   df1   df2 p      r2     cohens_f percent_var_explained
  <chr>  <int> <dbl> <chr>  <chr>  <chr>    <chr>                
1 87.27      4 38108 < .001 0.0190 0.1392   1.9012               
# Pairwise comparisons childhood SES groups
emm_ses <- emmeans::emmeans(
  model_ses,
  ~ childhood_SES_cat
); emm_ses
 childhood_SES_cat             emmean    SE    df lower.CL upper.CL
 Poor                            43.5 0.260 38108     43.0     44.0
 Below average but not poor      45.0 0.156 38108     44.7     45.3
 Around average                  47.1 0.123 38108     46.8     47.3
 Above average but not wealthy   47.9 0.161 38108     47.6     48.2
 Wealthy                         49.2 0.490 38108     48.2     50.1

Confidence level used: 0.95 
adj_contr <-
  emmeans::contrast(
    emm_ses,
    method = "pairwise",
    adjust = "holm"
  )

summary(
    adj_contr,
    infer = TRUE
  )
 contrast                                                   estimate    SE    df lower.CL upper.CL t.ratio p.value
 Poor - Below average but not poor                            -1.483 0.303 38108    -2.33   -0.632  -4.893  <.0001
 Poor - Around average                                        -3.524 0.287 38108    -4.33   -2.718 -12.265  <.0001
 Poor - Above average but not wealthy                         -4.387 0.306 38108    -5.25   -3.530 -14.358  <.0001
 Poor - Wealthy                                               -5.651 0.555 38108    -7.21   -4.094 -10.190  <.0001
 Below average but not poor - Around average                  -2.041 0.199 38108    -2.60   -1.483 -10.269  <.0001
 Below average but not poor - Above average but not wealthy   -2.904 0.224 38108    -3.53   -2.274 -12.947  <.0001
 Below average but not poor - Wealthy                         -4.168 0.514 38108    -5.61   -2.724  -8.104  <.0001
 Around average - Above average but not wealthy               -0.863 0.202 38108    -1.43   -0.295  -4.266  0.0001
 Around average - Wealthy                                     -2.127 0.505 38108    -3.54   -0.709  -4.211  0.0001
 Above average but not wealthy - Wealthy                      -1.264 0.516 38108    -2.71    0.184  -2.450  0.0143

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 10 estimates 
P value adjustment: holm method for 10 tests 
# Income and childhood SES Main effects
model_income_ses <-
  survey::svyglm(
    mpwb_sum ~ income_merg_group + childhood_SES_cat,
    design = svy_inc_ses
  )
summary(model_income_ses)

Call:
svyglm(formula = mpwb_sum ~ income_merg_group + childhood_SES_cat, 
    design = svy_inc_ses)

Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)

Coefficients:
                    Estimate Std. Error t value             Pr(>|t|)    
(Intercept)         46.25256    0.14290 323.671 < 0.0000000000000002 ***
income_merg_group.L  4.58593    0.31873  14.388 < 0.0000000000000002 ***
income_merg_group.Q -0.29520    0.28515  -1.035             0.300566    
income_merg_group.C  0.25879    0.19894   1.301             0.193309    
income_merg_group^4 -0.59832    0.16103  -3.716             0.000203 ***
childhood_SES_cat.L  3.69037    0.35855  10.292 < 0.0000000000000002 ***
childhood_SES_cat.Q -0.52252    0.30809  -1.696             0.089888 .  
childhood_SES_cat.C -0.06313    0.22405  -0.282             0.778142    
childhood_SES_cat^4  0.45573    0.15264   2.986             0.002831 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 121.5788)

Number of Fisher Scoring iterations: 2
print_summ(
  model_income_ses, svy_inc_ses, "mpwb_sum", "income_merg_group + childhood_SES_cat")
Error in solve.default(V): 'a' is 0-diml
# Income and childhood SES Interaction
model_income_ses_int <-
  survey::svyglm(
    mpwb_sum ~ income_merg_group * childhood_SES_cat,
    design = svy_inc_ses
  )
summary(model_income_ses_int)

Call:
svyglm(formula = mpwb_sum ~ income_merg_group * childhood_SES_cat, 
    design = svy_inc_ses)

Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)

Coefficients:
                                        Estimate Std. Error t value             Pr(>|t|)    
(Intercept)                             46.26483    0.15304 302.313 < 0.0000000000000002 ***
income_merg_group.L                      4.63870    0.39961  11.608 < 0.0000000000000002 ***
income_merg_group.Q                     -0.15220    0.38343  -0.397             0.691415    
income_merg_group.C                      0.32351    0.27458   1.178             0.238725    
income_merg_group^4                     -0.98679    0.29377  -3.359             0.000783 ***
childhood_SES_cat.L                      3.53421    0.43822   8.065 0.000000000000000754 ***
childhood_SES_cat.Q                     -0.68714    0.38177  -1.800             0.071886 .  
childhood_SES_cat.C                      0.05104    0.29275   0.174             0.861599    
childhood_SES_cat^4                      0.24021    0.21194   1.133             0.257063    
income_merg_group.L:childhood_SES_cat.L -0.93085    1.11900  -0.832             0.405492    
income_merg_group.Q:childhood_SES_cat.L  0.22070    1.09446   0.202             0.840188    
income_merg_group.C:childhood_SES_cat.L -0.53745    0.78566  -0.684             0.493935    
income_merg_group^4:childhood_SES_cat.L -2.10395    0.87946  -2.392             0.016747 *  
income_merg_group.L:childhood_SES_cat.Q  0.40277    0.98252   0.410             0.681859    
income_merg_group.Q:childhood_SES_cat.Q -0.18342    0.95475  -0.192             0.847651    
income_merg_group.C:childhood_SES_cat.Q  0.23270    0.68409   0.340             0.733735    
income_merg_group^4:childhood_SES_cat.Q -1.25365    0.75504  -1.660             0.096847 .  
income_merg_group.L:childhood_SES_cat.C -0.02718    0.78728  -0.035             0.972464    
income_merg_group.Q:childhood_SES_cat.C  1.48587    0.73619   2.018             0.043565 *  
income_merg_group.C:childhood_SES_cat.C  0.25358    0.52702   0.481             0.630403    
income_merg_group^4:childhood_SES_cat.C -1.39863    0.52391  -2.670             0.007597 ** 
income_merg_group.L:childhood_SES_cat^4  0.34907    0.59708   0.585             0.558801    
income_merg_group.Q:childhood_SES_cat^4 -0.58874    0.53759  -1.095             0.273458    
income_merg_group.C:childhood_SES_cat^4 -0.20592    0.38059  -0.541             0.588472    
income_merg_group^4:childhood_SES_cat^4 -0.79241    0.32864  -2.411             0.015907 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 121.4091)

Number of Fisher Scoring iterations: 2
print_summ(
  model_income_ses_int, svy_inc_ses, 
  "mpwb_sum", "income_merg_group * childhood_SES_cat")
Error in solve.default(V): 'a' is 0-diml
anova(model_income_ses, model_income_ses_int)
Working (Rao-Scott+F) LRT for income_merg_group:childhood_SES_cat
 in svyglm(formula = mpwb_sum ~ income_merg_group * childhood_SES_cat, 
    design = svy_inc_ses)
Working 2logLR =  26.03592 p= 0.058265 
(scale factors:  1.7 1.4 1.2 1.1 1.1 1.1 1 0.94 0.92 0.89 0.84 0.79 0.79 0.77 0.72 0.69 );  denominator df= 38088
AIC(model_income_ses, model_income_ses_int)
        eff.p      AIC deltabar
[1,] 16.88628 291156.6 1.876253
[2,] 49.53248 291168.7 1.981299

A14. Working Conditions and Mental Health

A16. Indications of Mental Illness

PHQ-4 Descriptives

df_phq <- df_gmh |>
  dplyr::filter(!is.na(gad_worry))

# Global
df_long_phq <-
  df_phq |>
  dplyr::select(ps_weight, dplyr::starts_with("gad"), dplyr::starts_with("phq")) |>
  dplyr::select(-phq4_cat) |>
  tidyr::pivot_longer(
    cols = -ps_weight,
    names_to = "variable",
    values_to = "value"
  )

summ_phq <-
  df_long_phq |>
  dplyr::group_by(variable) |>
  dplyr::summarise(
    unweighted_n = base::round(base::sum(!is.na(value)), 2),
    unweighted_mean = base::round(base::mean(value, na.rm = TRUE), 2),
    unweighted_sd = base::round(stats::sd(value, na.rm = TRUE), 2),
    unweighted_median = base::round(stats::median(value, na.rm = TRUE), 2),
    unweighted_iqr = base::round(stats::IQR(value, na.rm = TRUE), 2),
    weighted_n = base::round(base::sum(ps_weight), 2),
    weighted_mean = base::round(Hmisc::wtd.mean(
      value, ps_weight,
      na.rm = TRUE
    ), 2),
    weighted_sd = base::round(sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)), 2),
    q25 = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.25,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    weighted_median = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.5,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    q75 = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.75,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    .groups = "drop"
  ) |>
  dplyr::mutate(
    weighted_iqr = q75 - q25
  ) |>
  dplyr::select(
    variable,
    unweighted_n,
    unweighted_mean,
    unweighted_sd,
    unweighted_median,
    unweighted_iqr,
    weighted_n,
    weighted_mean,
    weighted_sd,
    weighted_median,
    weighted_iqr
  )

summ_phq |>
  print_reactable(sorted_col = "variable", width = 800)
# Within-country
df_long_phq <-
  df_phq |>
  dplyr::select(
    country,
    ps_weight,
    dplyr::starts_with("gad"),
    dplyr::starts_with("phq")
  ) |>
  dplyr::select(-phq4_cat) |>
  tidyr::pivot_longer(
    cols = -c(country, ps_weight),
    names_to = "variable",
    values_to = "value"
  )

summ_phq_cty <-
  df_long_phq |>
  dplyr::group_by(country, variable) |>
  dplyr::summarise(
    unweighted_n = base::round(base::sum(!is.na(value)), 2),
    unweighted_mean = base::round(base::mean(value, na.rm = TRUE), 2),
    unweighted_sd = base::round(stats::sd(value, na.rm = TRUE), 2),
    unweighted_median = base::round(stats::median(value, na.rm = TRUE), 2),
    unweighted_iqr = base::round(stats::IQR(value, na.rm = TRUE), 2),
    weighted_n = base::round(base::sum(ps_weight[!is.na(value)]), 2),
    weighted_mean = base::round(Hmisc::wtd.mean(
      value,
      weights = ps_weight,
      na.rm = TRUE
    ), 2),
    weighted_sd = base::round(base::sqrt(
      Hmisc::wtd.var(
        value,
        weights = ps_weight,
        na.rm = TRUE
      )
    ), 2),
    q25 = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.25,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    weighted_median = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.5,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    q75 = base::round(Hmisc::wtd.quantile(
      value,
      weights = ps_weight,
      probs = 0.75,
      na.rm = TRUE,
      normwt = FALSE
    ), 2),
    .groups = "drop"
  ) |>
  dplyr::mutate(
    weighted_iqr = q75 - q25
  ) |>
  dplyr::select(
    country,
    variable,
    unweighted_n,
    unweighted_mean,
    unweighted_sd,
    unweighted_median,
    unweighted_iqr,
    weighted_n,
    weighted_mean,
    weighted_sd,
    weighted_median,
    weighted_iqr
  )

summ_phq_cty |>
  print_reactable(sorted_col = "country", width = 800)

Prevalence Estimates (Percentages) of PHQ-2, GAD-2, and PHQ-4

# Global
dplyr::summarise(df_phq,
    n_uw = dplyr::n(),
    n_w_eff = 
      (base::sum(ps_weight, na.rm = TRUE)^2) / base::sum(ps_weight^2, na.rm = TRUE),
    sum_w = 
      base::sum(ps_weight, na.rm=TRUE),

    phq2_3_unw = 
      base::round(base::mean(phq2_sum_rec >= 3, na.rm = TRUE) * 100, 2),
    phq2_3_w = 
      base::round(Hmisc::wtd.mean(as.numeric(phq2_sum_rec >= 3), ps_weight) * 100, 2),
    
    gad2_3_unw = 
      base::round(base::mean(gad2_sum_rec >= 3,  na.rm = TRUE) * 100, 2),
    gad2_3_w = 
      base::round(Hmisc::wtd.mean(as.numeric(gad2_sum_rec >= 3), ps_weight) * 100, 2),

    phq4_3_unw = 
      base::round(mean(phq4_sum_rec >= 3 & phq4_sum_rec <=5, na.rm = TRUE) * 100, 2),
    phq4_3_w = 
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 3 & phq4_sum_rec <=5, ps_weight) * 100, 2),

    phq4_6_unw = 
      base::round(mean(phq4_sum_rec >= 6 & phq4_sum_rec <=8, na.rm = TRUE) * 100, 2),
    phq4_6_w = 
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 6 & phq4_sum_rec <=8, ps_weight) * 100),
    
    phq4_9_unw = 
      base::round(mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE) * 100, 2),
    phq4_9_w = 
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 9 & phq4_sum_rec <=12, ps_weight) * 100, 2)
) |>
  tidyr::pivot_longer(
    cols = everything(),
    names_to = "measure",
    values_to = "prevalence"
  )
# A tibble: 13 × 2
   measure    prevalence
   <chr>           <dbl>
 1 n_uw          38509  
 2 n_w_eff       20971. 
 3 sum_w         26610. 
 4 phq2_3_unw       20.3
 5 phq2_3_w         21.0
 6 gad2_3_unw       22.1
 7 gad2_3_w         22.6
 8 phq4_3_unw       55.3
 9 phq4_3_w         53.7
10 phq4_6_unw       12.8
11 phq4_6_w         13  
12 phq4_9_unw        9.6
13 phq4_9_w         10.2
# Within-country
within_phq <-
  df_phq |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    n_uw = dplyr::n(),
    n_w_eff = 
      base::round(
        (base::sum(ps_weight, na.rm = TRUE)^2) /
          base::sum(ps_weight^2, na.rm = TRUE), 
        2),
    sum_w = base::round(base::sum(ps_weight, na.rm = TRUE), 2),

    # PHQ-2
    phq2_3_unw =
      base::round(base::mean(depression_screen, na.rm = TRUE) * 100, 2),
    phq2_3_w =
      base::round(Hmisc::wtd.mean(depression_screen, ps_weight, na.rm = TRUE) * 100, 2),
    phq2_3_pos_uw =
      base::sum(depression_screen == 1, na.rm = TRUE),
    phq2_3_pos_sum_w =
      base::round(base::sum(ps_weight[depression_screen == 1 & !is.na(depression_screen)], na.rm = TRUE), 0),

    # GAD-2
    gad2_3_unw =
      base::round(base::mean(anxiety_screen, na.rm = TRUE) * 100, 2),
    gad2_3_w =
      base::round(Hmisc::wtd.mean(anxiety_screen, ps_weight, na.rm = TRUE) * 100, 2),
    gad2_3_pos_uw =
      base::sum(anxiety_screen == 1, na.rm = TRUE),
    gad2_3_pos_w =
      base::round(base::sum(ps_weight[anxiety_screen == 1 & !is.na(anxiety_screen)], na.rm = TRUE), 2),

    # PHQ-4 mild (3–5)
    phq4_3_unw =
      base::round(base::mean(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, na.rm = TRUE) * 100, 2),
    phq4_3_w =
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, ps_weight, na.rm = TRUE) * 100, 2),
    phq4_3_pos_uw =
      base::sum(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, na.rm = TRUE),
    phq4_3_pos_w =
      base::round(base::sum(ps_weight[phq4_sum_rec >= 3 & phq4_sum_rec <= 5 & !is.na(phq4_sum_rec)], na.rm = TRUE), 2),

    # PHQ-4 moderate (6–8)
    phq4_6_unw =
      base::round(base::mean(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, na.rm = TRUE) * 100, 2),
    phq4_6_w =
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, ps_weight, na.rm = TRUE) * 100, 2),
    phq4_6_pos_uw =
      base::sum(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, na.rm = TRUE),
    phq4_6_pos_w =
      base::round(base::sum(ps_weight[phq4_sum_rec >= 6 & phq4_sum_rec <= 8 & !is.na(phq4_sum_rec)], na.rm = TRUE), 0),

    # PHQ-4 severe (9–12)
    phq4_9_unw =
      base::round(base::mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE) * 100, 2),
    phq4_9_w =
      base::round(Hmisc::wtd.mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, ps_weight, na.rm = TRUE) * 100, 2),
    phq4_9_pos_uw =
      base::sum(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE),
    phq4_9_pos_w =
      base::round(base::sum(ps_weight[phq4_sum_rec >= 9 & phq4_sum_rec <= 12 & !is.na(phq4_sum_rec)], na.rm = TRUE), 0)
  )

within_phq |> print_reactable(sorted_col = "country", width = 800)
plot_df <-
  within_phq |>
  dplyr::mutate(
    country = forcats::fct_reorder(country, gad2_3_w, .desc = TRUE)
  ) |>
  tidyr::pivot_longer(
    cols = c(phq2_3_w, gad2_3_w),
    names_to = "screen",
    values_to = "estimate"
  ) |>
  dplyr::mutate(
    screen = dplyr::recode(
      screen,
      phq2_3_w = "PHQ-2 ≥ 3",
      gad2_3_w = "GAD-2 ≥ 3"
    )
  )

ggplot2::ggplot(plot_df, ggplot2::aes(x = estimate, y = country)) +
  ggplot2::geom_point() +
  ggplot2::labs(
    x = "Screen-positive (%)",
    y = "",
  ) +
  ggplot2::facet_wrap(~screen, ncol = 2, scales = "free_x") +
  theme(panel.grid.major.y = ggplot2::element_line(
        color = "#ddeded", linewidth = 0.25)
  )

## -------------------------------------------------------------------
## 2) p_phq4_country: PHQ-4 mean by country (+ severity cutpoints)
## -------------------------------------------------------------------
phq4_country <- df_gmh %>%
  filter(!is.na(country), !is.na(phq4_sum)) %>%
  group_by(country) %>%
  summarise(
    n = n(),
    mean = mean(phq4_sum),
    sd   = sd(phq4_sum),
    se   = ifelse(n > 1, sd/sqrt(n), NA_real_),
    lo   = mean - 1.96*se,
    hi   = mean + 1.96*se,
    .groups = "drop"
  ) %>%
  arrange(mean) %>%
  mutate(country = factor(country, levels = country))

overall_phq4_mean <- mean(df_gmh$phq4_sum, na.rm = TRUE)

p_phq4_country <- ggplot(phq4_country, aes(x = mean, y = fct_rev(country))) +
  geom_vline(xintercept = overall_phq4_mean, linetype = "dashed", color = "grey40") +
  geom_vline(xintercept = c(3,6,9), linetype = "dotted", color = "grey60") +
  geom_errorbarh(aes(xmin = lo, xmax = hi), height = 0.2) +
  geom_point(size = 2) +
  scale_x_continuous(limits = range(c(phq4_country$lo, phq4_country$hi), na.rm = TRUE),
                     expand = expansion(mult = c(0.00, 0.04))) +
  labs(title = "PHQ-4 mean by country",
       x = "PHQ-4 (0–12), mean ± 95% CI", y = NULL) +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank())

Figure 6

# Plot A: Ridge plot of MPWB by PHQ-4
df_phq <- df_phq |>
  dplyr::mutate(
    phq4_cat2 = base::factor(
      dplyr::case_when(
        phq4_cat == "Normal (0–2)" ~ "0-2\nNormal",
        phq4_cat == "Mild (3–5)" ~ "3-5\nMild",
        phq4_cat == "Moderate (6–8)" ~ "6-8\nModerate",
        phq4_cat == "Severe (9–12)" ~ "9-12\nSevere",
        TRUE ~ phq4_cat
      ),
      levels = c(
        "0-2\nNormal",
        "3-5\nMild",
        "6-8\nModerate",
        "9-12\nSevere"
      )
  )
)

# Palette
phq4_cols <- c(
  "0-2\nNormal" = "#E6EEF7",
  "3-5\nMild" = "#C9DBF0",
  "6-8\nModerate" = "#9FBDE0",
  "9-12\nSevere" = "#5B88C8"
)

p_ridge <- 
  # We are going to pass unweighted mpwb_sum to the aes because the weights 
  # are applied within geom_density_ridges.
  ggplot(df_phq, aes(x = mpwb_sum, y = phq4_cat2, fill = phq4_cat2)) +
  geom_density_ridges(
    aes(weight = ps_weight),
    scale = 1.5, rel_min_height = 0.01, alpha = 0.9, color = "white"
  ) +
  scale_x_continuous(
    limits = c(0, 80),
    breaks = seq(0, 70, by = 10),
    expand = c(0, 0)
  ) +
  scale_fill_manual(values = phq4_cols) +
  coord_cartesian(xlim = c(5, 80)) +
  labs(
    x = "MPWB Sum",
    y = "PHQ-4\n"
  ) +
  theme(
    axis.line.x = element_blank(),
    axis.text.y  = element_text(color = "#051520", margin = margin(t = 1), face = "bold"),
    legend.position = "none",
    panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.title.y = element_text(margin = margin(r = 10), color = "#051520", face = "bold"),
    axis.title.x = element_text(margin = margin(t = 10), color = "#051520", face = "bold")
  ) +
  geom_vline(xintercept = 40, linetype = "dashed", color = "#6F7C91", linewidth = 0.6, alpha = 0.9)

# Histogram of MPWB sum
ggplot(df_phq, aes(x = mpwb_sum)) +
  geom_histogram(binwidth = 1, color = "#5B88C8", fill = "#9FBDE0") +
  labs(
    x = "MPWB sum",
    y = "Frequency"
  )

# Plot B: PHQ-4 and MPWB by country
svy_phq <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_phq)

means_cty <- survey::svyby(
  ~ phq4_sum_rec + mpwb_sum,
  ~ country,
  svy_phq,
  survey::svymean,
  na.rm = TRUE,
  vartype = NULL
) |>
  as.data.frame()

weights_cty <- df_phq |>
  dplyr::group_by(country, iso2) |>
  dplyr::summarise(ps_weight = sum(ps_weight, na.rm = TRUE), .groups = "drop") |>
  dplyr::mutate(
    iso2 = tolower(iso2),
    alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1)
  )

means_cty <- means_cty |>
  dplyr::left_join(weights_cty, by = "country")

r_w_val <- weighted_corr(means_cty, phq4_sum_rec, mpwb_sum)[[1]]

p_phq4_mpwb <-
  ggplot(means_cty, aes(x = phq4_sum_rec, y = mpwb_sum)) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "#6F7C91",
    linewidth = 0.8
  ) +
  labs(
    subtitle = bquote("Pearson's " ~ italic(r) ~ "=" ~ .(r_w_val)),
    x = "PHQ-4",
    y = "MPWB Sum"
  ) +
  geom_point(
    aes(alpha = alpha_country),
    shape = 21,
    colour = "#051520",
    size = 4
  ) +
  with_shadow(
    geom_point(
      aes(alpha = alpha_country),
      size = 4.2,
      alpha = 0.5,
      stroke = 0
    ),
    sigma = 2,
    colour = "gray60",
    x_offset = 1,
    y_offset = 1
  ) +
  ggflags::geom_flag(aes(country = iso2), size = 3.5, na.rm = TRUE) +
  theme(
    plot.subtitle = element_text(color = "#051520", family = ""),
    legend.position = "none",
    axis.line.x = element_blank(),
    panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.title.x = element_text(
      margin = margin(t = 10),
      color = "#051520",
      face = "bold"
    ),
    axis.title.y = element_text(
      margin = margin(r = 10),
      color = "#051520",
      face = "bold"
    ),
    axis.text.x  = element_text(
      color = "#051520",
      margin = margin(t = 1),
      face = "plain"
    )
  ) +
  guides(alpha = "none") + coord_flip()
cowplot::plot_grid(
  p_ridge, p_phq4_mpwb,
  labels = c("A", "B"),
  label_size = 14,
  ncol = 2,
  rel_widths = c(1, 1)
)

Correlation between PHQ-2 and GAD-2

means_cty_gp <- survey::svyby(
  ~ phq2_sum_rec + gad2_sum_rec,
  ~ country,
  svy_phq,
  survey::svymean,
  na.rm = TRUE,
  vartype = NULL
) |>
  as.data.frame()

weights_cty_gp <- df_phq |>
  dplyr::group_by(country, iso2) |>
  dplyr::summarise(ps_weight = sum(ps_weight, na.rm = TRUE), .groups = "drop") |>
  dplyr::mutate(
    iso2 = tolower(iso2),
    alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1)
  )

means_cty_gp <- means_cty_gp |>
  dplyr::left_join(weights_cty, by = "country")

r_w_val <- weighted_corr(means_cty_gp, phq2_sum_rec, gad2_sum_rec)[[1]]

p_phq2_gad2 <-
  ggplot(means_cty_gp, aes(x = phq2_sum_rec, y = gad2_sum_rec)) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "#6F7C91",
    linewidth = 0.8
  ) +
  labs(
    subtitle = bquote("Pearson's " ~ italic(r) ~ "=" ~ .(r_w_val)),
    x = "PHQ-2",
    y = "GAD2"
  ) +
  geom_point(
    aes(alpha = alpha_country),
    shape = 21,
    colour = "#051520",
    size = 4
  ) +
  with_shadow(
    geom_point(
      aes(alpha = alpha_country),
      size = 4.2,
      alpha = 0.5,
      stroke = 0
    ),
    sigma = 2,
    colour = "gray60",
    x_offset = 1,
    y_offset = 1
  ) +
  ggflags::geom_flag(aes(country = iso2), size = 3.5, na.rm = TRUE) +
  theme(
    plot.subtitle = element_text(color = "#051520", family = ""),
    legend.position = "none",
    axis.line.x = element_blank(),
    panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
    axis.title.x = element_text(
      margin = margin(t = 10),
      color = "#051520",
      face = "bold"
    ),
    axis.title.y = element_text(
      margin = margin(r = 10),
      color = "#051520",
      face = "bold"
    ),
    axis.text.x  = element_text(
      color = "#051520",
      margin = margin(t = 1),
      face = "plain"
    )
  ) +
  guides(alpha = "none") + coord_flip()
p_phq2_gad2

Information About the R Session

sessioninfo::session_info()
─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31)
 os       macOS Sequoia 15.6
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Amsterdam
 date     2025-12-15
 pandoc   3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
 quarto   1.4.549 @ /usr/local/bin/quarto

─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 package           * version    date (UTC) lib source
 abind               1.4-8      2024-09-12 [1] CRAN (R 4.5.0)
 archive             1.1.12     2025-03-20 [1] CRAN (R 4.5.0)
 askpass             1.2.1      2024-10-04 [1] CRAN (R 4.5.0)
 backports           1.5.0      2024-05-23 [1] CRAN (R 4.5.0)
 base64enc           0.1-3      2015-07-28 [1] CRAN (R 4.5.0)
 binom             * 1.1-1.1    2022-05-02 [1] CRAN (R 4.5.0)
 bit                 4.6.0      2025-03-06 [1] CRAN (R 4.5.0)
 bit64               4.6.0-1    2025-01-16 [1] CRAN (R 4.5.0)
 boot                1.3-32     2025-08-29 [1] CRAN (R 4.5.2)
 broom               1.0.9      2025-07-28 [1] CRAN (R 4.5.0)
 broom.mixed       * 0.2.9.6    2024-10-15 [1] CRAN (R 4.5.0)
 car               * 3.1-3      2024-09-27 [1] CRAN (R 4.5.0)
 carData           * 3.0-5      2022-01-06 [1] CRAN (R 4.5.0)
 cellranger          1.1.0      2016-07-27 [1] CRAN (R 4.5.0)
 checkmate           2.3.2      2024-07-29 [1] CRAN (R 4.5.0)
 chromote            0.5.1      2025-04-24 [1] CRAN (R 4.5.0)
 class               7.3-23     2025-01-01 [1] CRAN (R 4.5.2)
 classInt            0.4-11     2025-01-08 [1] CRAN (R 4.5.0)
 cli                 3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
 cluster             2.1.8.1    2025-03-12 [1] CRAN (R 4.5.2)
 coda                0.19-4.1   2024-01-31 [1] CRAN (R 4.5.0)
 codetools           0.2-20     2024-03-31 [1] CRAN (R 4.5.2)
 colorspace          2.1-1      2024-07-26 [1] CRAN (R 4.5.0)
 commonmark          2.0.0      2025-07-07 [1] CRAN (R 4.5.0)
 corrplot          * 0.95       2024-10-14 [1] CRAN (R 4.5.0)
 countrycode       * 1.6.1      2025-03-31 [1] CRAN (R 4.5.0)
 cowplot           * 1.2.0      2025-07-07 [1] CRAN (R 4.5.0)
 crayon              1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
 crosstalk           1.2.1      2023-11-23 [1] CRAN (R 4.5.0)
 curl                7.0.0      2025-08-19 [1] CRAN (R 4.5.0)
 data.table          1.17.8     2025-07-10 [1] CRAN (R 4.5.0)
 DBI                 1.2.3      2024-06-02 [1] CRAN (R 4.5.0)
 digest              0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
 dplyr             * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
 e1071               1.7-16     2024-09-16 [1] CRAN (R 4.5.0)
 emmeans           * 1.11.2     2025-07-11 [1] CRAN (R 4.5.0)
 estimability        1.5.1      2024-05-12 [1] CRAN (R 4.5.0)
 evaluate            1.0.4      2025-06-18 [1] CRAN (R 4.5.0)
 farver              2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
 fastmap             1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
 flextable         * 0.9.10     2025-08-24 [1] CRAN (R 4.5.0)
 fontBitstreamVera   0.1.1      2017-02-01 [1] CRAN (R 4.5.0)
 fontLiberation      0.1.0      2016-10-15 [1] CRAN (R 4.5.0)
 fontquiver          0.2.1      2017-02-01 [1] CRAN (R 4.5.0)
 forcats           * 1.0.0      2023-01-29 [1] CRAN (R 4.5.0)
 foreach             1.5.2      2022-02-02 [1] CRAN (R 4.5.0)
 foreign             0.8-90     2025-03-31 [1] CRAN (R 4.5.2)
 Formula             1.2-5      2023-02-24 [1] CRAN (R 4.5.0)
 fs                  1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
 furrr               0.3.1      2022-08-15 [1] CRAN (R 4.5.0)
 future              1.67.0     2025-07-29 [1] CRAN (R 4.5.0)
 gdata               3.0.1      2024-10-22 [1] CRAN (R 4.5.0)
 gdtools             0.4.4      2025-10-06 [1] CRAN (R 4.5.0)
 generics            0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
 ggeffects         * 2.3.0      2025-06-13 [1] CRAN (R 4.5.0)
 ggflags           * 0.0.4      2023-10-10 [1] https://jimjam-slam.r-universe.dev (R 4.5.1)
 ggfx              * 1.0.2      2025-07-24 [1] CRAN (R 4.5.0)
 ggh4x             * 0.3.1      2025-05-30 [1] CRAN (R 4.5.0)
 ggplot2           * 4.0.0      2025-09-11 [1] CRAN (R 4.5.0)
 ggplotify         * 0.1.2      2023-08-09 [1] CRAN (R 4.5.0)
 ggridges          * 0.5.7      2025-08-27 [1] CRAN (R 4.5.0)
 ggtext            * 0.1.2      2022-09-16 [1] CRAN (R 4.5.0)
 glmnet              4.1-10     2025-07-17 [1] CRAN (R 4.5.0)
 globals             0.18.0     2025-05-08 [1] CRAN (R 4.5.0)
 glue                1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
 gridExtra         * 2.3        2017-09-09 [1] CRAN (R 4.5.0)
 gridGraphics        0.5-1      2020-12-13 [1] CRAN (R 4.5.0)
 gridtext            0.1.5      2022-09-16 [1] CRAN (R 4.5.0)
 grImport2           0.3-3      2024-07-30 [1] CRAN (R 4.5.0)
 gt                  1.0.0      2025-04-05 [1] CRAN (R 4.5.0)
 gtable            * 0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
 gtools              3.9.5      2023-11-20 [1] CRAN (R 4.5.0)
 haven               2.5.5      2025-05-30 [1] CRAN (R 4.5.0)
 Hmisc             * 5.2-4      2025-10-05 [1] CRAN (R 4.5.0)
 hms                 1.1.3      2023-03-21 [1] CRAN (R 4.5.0)
 htmlTable           2.4.3      2024-07-21 [1] CRAN (R 4.5.0)
 htmltools         * 0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
 htmlwidgets         1.6.4      2023-12-06 [1] CRAN (R 4.5.0)
 httpuv              1.6.16     2025-04-16 [1] CRAN (R 4.5.0)
 httr                1.4.7      2023-08-15 [1] CRAN (R 4.5.0)
 insight             1.4.2      2025-09-02 [1] CRAN (R 4.5.0)
 interactions      * 1.2.0      2024-07-29 [1] CRAN (R 4.5.0)
 iterators           1.0.14     2022-02-05 [1] CRAN (R 4.5.0)
 janitor           * 2.2.1      2024-12-22 [1] CRAN (R 4.5.0)
 jomo                2.7-6      2023-04-15 [1] CRAN (R 4.5.0)
 jpeg                0.1-11     2025-03-21 [1] CRAN (R 4.5.0)
 jquerylib           0.1.4      2021-04-26 [1] CRAN (R 4.5.0)
 jsonlite            2.0.0      2025-03-27 [1] CRAN (R 4.5.0)
 jtools              2.3.0      2024-08-25 [1] CRAN (R 4.5.0)
 kableExtra        * 1.4.0      2024-01-24 [1] CRAN (R 4.5.0)
 KernSmooth          2.23-26    2025-01-01 [1] CRAN (R 4.5.2)
 knitr               1.50       2025-03-16 [1] CRAN (R 4.5.0)
 labeling            0.4.3      2023-08-29 [1] CRAN (R 4.5.0)
 labelled          * 2.16.0     2025-10-22 [1] CRAN (R 4.5.0)
 later               1.4.2      2025-04-08 [1] CRAN (R 4.5.0)
 lattice             0.22-7     2025-04-02 [1] CRAN (R 4.5.2)
 lavaan            * 0.6-19     2024-09-26 [1] CRAN (R 4.5.0)
 lazyeval            0.2.2      2019-03-15 [1] CRAN (R 4.5.0)
 leaflet           * 2.2.2      2024-03-26 [1] CRAN (R 4.5.0)
 leaflet.extras    * 2.0.1      2024-08-19 [1] CRAN (R 4.5.0)
 leaflet.extras2   * 1.3.2      2025-08-27 [1] CRAN (R 4.5.0)
 leaflet.providers   2.0.0      2023-10-17 [1] CRAN (R 4.5.0)
 lifecycle           1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
 listenv             0.9.1      2024-01-29 [1] CRAN (R 4.5.0)
 litedown            0.7        2025-04-08 [1] CRAN (R 4.5.0)
 lme4              * 1.1-37     2025-03-26 [1] CRAN (R 4.5.0)
 lsr               * 0.5.2      2021-12-01 [1] CRAN (R 4.5.0)
 lubridate         * 1.9.4      2024-12-08 [1] CRAN (R 4.5.0)
 magick              2.8.7      2025-06-06 [1] CRAN (R 4.5.0)
 magrittr            2.0.4      2025-09-12 [1] CRAN (R 4.5.0)
 markdown            2.0        2025-03-23 [1] CRAN (R 4.5.0)
 MASS                7.3-65     2025-02-28 [1] CRAN (R 4.5.2)
 mathjaxr            2.0-0      2025-12-01 [1] CRAN (R 4.5.2)
 Matrix            * 1.7-4      2025-08-28 [1] CRAN (R 4.5.2)
 metadat           * 1.4-0      2025-02-04 [1] CRAN (R 4.5.0)
 metafor           * 4.8-0      2025-01-28 [1] CRAN (R 4.5.0)
 MetBrewer         * 0.2.0      2022-03-21 [1] CRAN (R 4.5.0)
 mgcv              * 1.9-3      2025-04-04 [1] CRAN (R 4.5.2)
 mice                3.18.0     2025-05-27 [1] CRAN (R 4.5.0)
 mime                0.13       2025-03-17 [1] CRAN (R 4.5.0)
 minqa               1.2.8      2024-08-17 [1] CRAN (R 4.5.0)
 mitml               0.4-5      2023-03-08 [1] CRAN (R 4.5.0)
 mitools             2.4        2019-04-26 [1] CRAN (R 4.5.0)
 mnormt              2.1.1      2022-09-26 [1] CRAN (R 4.5.0)
 multcomp            1.4-28     2025-01-29 [1] CRAN (R 4.5.0)
 mvtnorm             1.3-3      2025-01-10 [1] CRAN (R 4.5.0)
 nlme              * 3.1-168    2025-03-31 [1] CRAN (R 4.5.2)
 nloptr              2.2.1      2025-03-17 [1] CRAN (R 4.5.0)
 nnet                7.3-20     2025-01-01 [1] CRAN (R 4.5.2)
 numDeriv          * 2016.8-1.1 2019-06-06 [1] CRAN (R 4.5.0)
 officer           * 0.7.0      2025-09-03 [1] CRAN (R 4.5.0)
 openssl             2.3.3      2025-05-26 [1] CRAN (R 4.5.0)
 pacman            * 0.5.1      2019-03-11 [1] CRAN (R 4.5.0)
 pagedown            0.23       2025-08-20 [1] CRAN (R 4.5.0)
 pan                 1.9        2023-12-07 [1] CRAN (R 4.5.0)
 pander              0.6.6      2025-03-01 [1] CRAN (R 4.5.0)
 parallelly          1.45.1     2025-07-24 [1] CRAN (R 4.5.0)
 pbivnorm            0.6.0      2015-01-23 [1] CRAN (R 4.5.0)
 performance       * 0.15.2     2025-10-06 [1] CRAN (R 4.5.0)
 pillar              1.11.0     2025-07-04 [1] CRAN (R 4.5.0)
 pkgconfig           2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
 plotly              4.11.0     2025-06-19 [1] CRAN (R 4.5.0)
 png                 0.1-8      2022-11-29 [1] CRAN (R 4.5.0)
 processx            3.8.6      2025-02-21 [1] CRAN (R 4.5.0)
 promises            1.3.3      2025-05-29 [1] CRAN (R 4.5.0)
 proxy               0.4-27     2022-06-09 [1] CRAN (R 4.5.0)
 ps                  1.9.1      2025-04-12 [1] CRAN (R 4.5.0)
 psych             * 2.5.6      2025-06-23 [1] CRAN (R 4.5.0)
 purrr             * 1.1.0      2025-07-10 [1] CRAN (R 4.5.0)
 quadprog            1.5-8      2019-11-20 [1] CRAN (R 4.5.0)
 qualtRics         * 3.2.1      2024-08-16 [1] CRAN (R 4.5.0)
 R6                  2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
 ragg                1.4.0      2025-04-10 [1] CRAN (R 4.5.0)
 rappdirs            0.3.3      2021-01-31 [1] CRAN (R 4.5.0)
 rbibutils           2.3        2024-10-04 [1] CRAN (R 4.5.0)
 RColorBrewer        1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
 Rcpp                1.1.0      2025-07-02 [1] CRAN (R 4.5.0)
 Rdpack              2.6.4      2025-04-09 [1] CRAN (R 4.5.0)
 reactable         * 0.4.4      2023-03-12 [1] CRAN (R 4.5.0)
 reactR              0.6.1      2024-09-14 [1] CRAN (R 4.5.0)
 readr             * 2.1.5      2024-01-10 [1] CRAN (R 4.5.0)
 readxl            * 1.4.5      2025-03-07 [1] CRAN (R 4.5.0)
 reformulas          0.4.1      2025-04-30 [1] CRAN (R 4.5.0)
 report            * 0.6.1      2025-02-07 [1] CRAN (R 4.5.0)
 rlang             * 1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
 rmarkdown           2.29       2024-11-04 [1] CRAN (R 4.5.0)
 rmcorr            * 0.7.0      2024-07-26 [1] CRAN (R 4.5.0)
 rnaturalearth     * 1.1.0      2025-07-28 [1] CRAN (R 4.5.0)
 rnaturalearthdata * 1.0.0      2024-02-09 [1] CRAN (R 4.5.0)
 rpart               4.1.24     2025-01-07 [1] CRAN (R 4.5.2)
 rstudioapi          0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
 S7                  0.2.0      2024-11-07 [1] CRAN (R 4.5.0)
 sandwich            3.1-1      2024-09-15 [1] CRAN (R 4.5.0)
 sass                0.4.10     2025-04-11 [1] CRAN (R 4.5.0)
 scales            * 1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
 see               * 0.11.0     2025-03-11 [1] CRAN (R 4.5.0)
 semTools          * 0.5-7      2025-03-13 [1] CRAN (R 4.5.0)
 servr               0.32       2024-10-04 [1] CRAN (R 4.5.0)
 sessioninfo       * 1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
 sf                * 1.0-21     2025-05-15 [1] CRAN (R 4.5.0)
 shape               1.4.6.1    2024-02-23 [1] CRAN (R 4.5.0)
 showtext          * 0.9-7      2024-03-02 [1] CRAN (R 4.5.0)
 showtextdb        * 3.0        2020-06-04 [1] CRAN (R 4.5.0)
 sjlabelled          1.2.0      2022-04-10 [1] CRAN (R 4.5.0)
 sjPlot            * 2.9.0      2025-07-10 [1] CRAN (R 4.5.0)
 snakecase           0.11.1     2023-08-27 [1] CRAN (R 4.5.0)
 srvyr               1.3.0      2024-08-19 [1] CRAN (R 4.5.0)
 stringi             1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
 stringr           * 1.5.1      2023-11-14 [1] CRAN (R 4.5.0)
 survey            * 4.4-8      2025-08-28 [1] CRAN (R 4.5.0)
 survival          * 3.8-3      2024-12-17 [1] CRAN (R 4.5.2)
 svglite             2.2.1      2025-05-12 [1] CRAN (R 4.5.0)
 sysfonts          * 0.8.9      2024-03-02 [1] CRAN (R 4.5.0)
 systemfonts         1.3.1      2025-10-01 [1] CRAN (R 4.5.0)
 textshaping         1.0.1      2025-05-01 [1] CRAN (R 4.5.0)
 TH.data             1.1-3      2025-01-17 [1] CRAN (R 4.5.0)
 tibble            * 3.3.0      2025-06-08 [1] CRAN (R 4.5.0)
 tidyr             * 1.3.1      2024-01-24 [1] CRAN (R 4.5.0)
 tidyselect          1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
 timechange          0.3.0      2024-01-18 [1] CRAN (R 4.5.0)
 tzdb                0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
 units               0.8-7      2025-03-11 [1] CRAN (R 4.5.0)
 utf8                1.2.6      2025-06-08 [1] CRAN (R 4.5.0)
 uuid                1.2-1      2024-07-29 [1] CRAN (R 4.5.0)
 vctrs               0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
 viridisLite         0.4.2      2023-05-02 [1] CRAN (R 4.5.0)
 visdat            * 0.6.0      2023-02-02 [1] CRAN (R 4.5.0)
 vroom               1.6.5      2023-12-05 [1] CRAN (R 4.5.0)
 webshot2            0.1.2      2025-04-23 [1] CRAN (R 4.5.0)
 websocket           1.4.4      2025-04-10 [1] CRAN (R 4.5.0)
 weights           * 1.1.2      2025-06-18 [1] CRAN (R 4.5.0)
 withr               3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
 xfun                0.52       2025-04-02 [1] CRAN (R 4.5.0)
 XML                 3.99-0.19  2025-08-22 [1] CRAN (R 4.5.0)
 xml2                1.3.8      2025-03-14 [1] CRAN (R 4.5.0)
 xtable              1.8-4      2019-04-21 [1] CRAN (R 4.5.0)
 yaml                2.3.10     2024-07-26 [1] CRAN (R 4.5.0)
 yulab.utils         0.2.1      2025-08-19 [1] CRAN (R 4.5.0)
 zip                 2.3.3      2025-05-13 [1] CRAN (R 4.5.0)
 zoo                 1.8-14     2025-04-10 [1] CRAN (R 4.5.0)

 [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
 * ── Packages attached to the search path.

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────